// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/core/float16.h"
#include "fastdeploy/utils/utils.h"

#include <algorithm>
#include <cstdlib>     // realloc in ReallocFn
#include <cstring>
#include <functional>  // std::multiplies
#include <iostream>    // std::cout in PrintInfo
#include <limits>      // std::numeric_limits
#include <numeric>     // std::accumulate in Numel
#ifdef WITH_GPU
#include <cuda_runtime_api.h>
#endif

namespace fastdeploy {

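// An FDTensor either owns its buffer (buffer_) or borrows external
// memory (external_data_ptr, set via SetExternalData). The accessors
// below return the borrowed pointer when one is present and fall back
// to the owned buffer otherwise.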
void* FDTensor::MutableData() {
  if (external_data_ptr != nullptr) {
    return external_data_ptr;
  }
  return buffer_;
}

void* FDTensor::Data() {
  if (external_data_ptr != nullptr) {
    return external_data_ptr;
  }
  return buffer_;
}

const void* FDTensor::Data() const {
  if (external_data_ptr != nullptr) {
    return external_data_ptr;
  }
  return buffer_;
}

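// Detaches the tensor from externally shared memory: reallocates an
// owned buffer of Nbytes() and copies the shared data into it before
// dropping the external pointer. The three-argument CopyBuffer call
// below relies on default arguments assumed to be declared in
// fd_tensor.h.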
void FDTensor::StopSharing() {
  if (IsShared()) {
    ReallocFn(Nbytes());
    CopyBuffer(buffer_, external_data_ptr, Nbytes());
    external_data_ptr = nullptr;
  }
}

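// Returns a pointer to CPU-readable data. For GPU tensors the data is
// first copied into temporary_cpu_buffer, so the returned pointer is
// only valid until the next CpuData() call.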
const void* FDTensor::CpuData() const {
  if (device == Device::GPU) {
#ifdef WITH_GPU
    auto* cpu_ptr = const_cast<std::vector<int8_t>*>(&temporary_cpu_buffer);
    cpu_ptr->resize(Nbytes());
    // Need to copy the CUDA memory to the CPU first.
    if (external_data_ptr != nullptr) {
      FDASSERT(cudaMemcpy(cpu_ptr->data(), external_data_ptr, Nbytes(),
                          cudaMemcpyDeviceToHost) == cudaSuccess,
               "[ERROR] Error occurs while copying memory from GPU to CPU");
    } else {
      FDASSERT(cudaMemcpy(cpu_ptr->data(), buffer_, Nbytes(),
                          cudaMemcpyDeviceToHost) == cudaSuccess,
               "[ERROR] Error occurs while copying buffer memory from GPU to "
               "CPU");
    }
    return cpu_ptr->data();
#else
    FDASSERT(false,
             "FastDeploy was not compiled with -DWITH_GPU=ON, so GPU tensor "
             "data cannot be copied to the CPU.");
#endif
  }
  return Data();
}

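// Wraps an existing buffer without copying it or taking ownership; the
// caller must keep data_buffer alive for as long as the tensor uses it.
// A minimal usage sketch, assuming the Device/device_id defaults are
// declared in fd_tensor.h:
//
//   std::vector<float> image(1 * 3 * 224 * 224);
//   FDTensor t("input");
//   t.SetExternalData({1, 3, 224, 224}, FDDataType::FP32, image.data());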
void FDTensor::SetExternalData(const std::vector<int64_t>& new_shape,
                               const FDDataType& data_type, void* data_buffer,
                               const Device& new_device, int new_device_id) {
  dtype = data_type;
  shape.assign(new_shape.begin(), new_shape.end());
  external_data_ptr = data_buffer;
  device = new_device;
  device_id = new_device_id;
}

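// ExpandDim and Squeeze only edit the shape metadata; the underlying
// buffer is untouched. For example, with shape = {2, 3}:
//   ExpandDim(1) -> shape {2, 1, 3}
//   Squeeze(1)   -> shape {2, 3} again (the squeezed extent must be 1).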
void FDTensor::ExpandDim(int64_t axis) {
  size_t ndim = shape.size();
  FDASSERT(axis >= 0 && axis <= static_cast<int64_t>(ndim),
           "The allowed 'axis' must be in range of [0, %lu]!", ndim);
  shape.insert(shape.begin() + axis, 1);
}

void FDTensor::Squeeze(int64_t axis) {
  size_t ndim = shape.size();
  FDASSERT(axis >= 0 && axis < static_cast<int64_t>(ndim),
           "The allowed 'axis' must be in range of [0, %lu)!", ndim);
  FDASSERT(shape[axis] == 1,
           "The No.%ld dimension of shape should be 1, but it is %ld!",
           (long)axis, (long)shape[axis]);
  shape.erase(shape.begin() + axis);
}

void FDTensor::Allocate(const std::vector<int64_t>& new_shape,
                        const FDDataType& data_type,
                        const std::string& tensor_name,
                        const Device& new_device) {
  dtype = data_type;
  name = tensor_name;
  shape.assign(new_shape.begin(), new_shape.end());
  device = new_device;
  size_t nbytes = Nbytes();
  FDASSERT(ReallocFn(nbytes),
           "The FastDeploy FDTensor failed to allocate memory");
}

int FDTensor::Nbytes() const { return Numel() * FDDataTypeSize(dtype); }

int FDTensor::Numel() const {
  return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
}

void FDTensor::Resize(size_t new_nbytes) { ReallocFn(new_nbytes); }

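// Resize(new_shape) reallocates only when the element count grows or
// when the tensor currently wraps external memory; in both cases the
// tensor ends up owning its buffer, since external_data_ptr is cleared
// without copying the shared data.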
void FDTensor::Resize(const std::vector<int64_t>& new_shape) {
  int numel = Numel();
  int new_numel = std::accumulate(new_shape.begin(), new_shape.end(), 1,
                                  std::multiplies<int>());
  if (new_numel > numel || external_data_ptr != nullptr) {
    size_t nbytes = new_numel * FDDataTypeSize(dtype);
    ReallocFn(nbytes);
  }
  shape.assign(new_shape.begin(), new_shape.end());
  external_data_ptr = nullptr;
}

void FDTensor::Resize(const std::vector<int64_t>& new_shape,
                      const FDDataType& data_type,
                      const std::string& tensor_name,
                      const Device& new_device) {
  external_data_ptr = nullptr;
  name = tensor_name;
  device = new_device;
  dtype = data_type;
  int new_nbytes = std::accumulate(new_shape.begin(), new_shape.end(), 1,
                                   std::multiplies<int>()) *
                   FDDataTypeSize(data_type);
  ReallocFn(new_nbytes);
  shape.assign(new_shape.begin(), new_shape.end());
}

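// Reshape follows the ReshapeOp convention: at most one dimension may
// be -1 (inferred from the element count) and 0 means "keep the input
// tensor's extent at this index". For example, reshaping {4, 3, 2}
// with {0, -1} yields {4, 6}. The total element count must match.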
bool FDTensor::Reshape(const std::vector<int64_t>& new_shape) {
  int numel = Numel();
  const int64_t unk_dim_val = -1;
  const int64_t copy_dim_val = 0;

  std::vector<int64_t> output_shape(new_shape.size(), 0);
  int64_t capacity = 1;
  int unk_dim_idx = -1;
  for (size_t i = 0; i < new_shape.size(); ++i) {
    if (new_shape[i] == unk_dim_val) {
      FDASSERT(unk_dim_idx == -1,
               "Only one dimension value of 'shape' in ReshapeOp can "
               "be -1. But received shape = [%s], shape[%d] is also -1.",
               Str(new_shape).c_str(), static_cast<int>(i));
      unk_dim_idx = static_cast<int>(i);
    } else if (new_shape[i] == copy_dim_val) {
      FDASSERT(i < shape.size(),
               "The index of 0 in `shape` must be less than "
               "the input tensor X's dimensions. "
               "But received shape = [%s], shape[%d] = 0, X's shape = [%s], "
               "X's dimensions = %d.",
               Str(new_shape).c_str(), static_cast<int>(i), Str(shape).c_str(),
               static_cast<int>(shape.size()));
    } else {
      FDASSERT(new_shape[i] > 0,
               "Each dimension value of 'shape' in ReshapeOp must not "
               "be negative except one unknown dimension. "
               "But received shape = [%s], shape[%d] = %ld.",
               Str(new_shape).c_str(), static_cast<int>(i),
               (long)new_shape[i]);
    }
    capacity *= (new_shape[i] ? new_shape[i] : shape[i]);
    output_shape[i] = (new_shape[i] ? new_shape[i] : shape[i]);
  }
  if (unk_dim_idx != -1) {
    // 'capacity' is negative here (it absorbed the -1), so the division
    // below yields the positive inferred extent.
    output_shape[unk_dim_idx] = -numel / capacity;
    FDASSERT(output_shape[unk_dim_idx] * capacity == -numel,
             "The 'shape' attribute in ReshapeOp is invalid. "
             "The input tensor X's size must be divisible by the known "
             "capacity of 'shape'. "
             "But received X's shape = [%s], X's size = %d, "
             "'shape' is [%s], known capacity of 'shape' is %ld.",
             Str(shape).c_str(), numel, Str(new_shape).c_str(),
             (long)capacity);
  } else {
    FDASSERT(numel == capacity,
             "The 'shape' in ReshapeOp is invalid. "
             "The input tensor X's size must be equal to the capacity of "
             "'shape'. "
             "But received X's shape = [%s], X's size = %d, 'shape' is "
             "[%s], the capacity of 'shape' is %ld.",
             Str(shape).c_str(), numel, Str(new_shape).c_str(),
             (long)capacity);
  }
  shape = output_shape;
  return true;
}

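// Helper for PrintInfo: a single pass over `size` elements interpreted
// as T, accumulating mean/max/min into the output parameters.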
template <typename T>
void CalculateStatisInfo(const void* src_ptr, int size, double* mean,
                         double* max, double* min) {
  const T* ptr = static_cast<const T*>(src_ptr);
  *mean = 0;
  *max = -std::numeric_limits<double>::infinity();
  *min = std::numeric_limits<double>::infinity();
  for (int i = 0; i < size; ++i) {
    if (*(ptr + i) > *max) {
      *max = *(ptr + i);
    }
    if (*(ptr + i) < *min) {
      *min = *(ptr + i);
    }
    *mean += *(ptr + i);
  }
  *mean = *mean / size;
}

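// Debug helper: prints name, shape, dtype and the mean/max/min of the
// tensor data (copied to the CPU first if necessary). Illustrative
// output:
//   after_norm: name=x, shape=1 3 224 224 , dtype=FP32, mean=0.5, ...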
void FDTensor::PrintInfo(const std::string& prefix) const {
  double mean = 0;
  double max = -std::numeric_limits<double>::infinity();
  double min = std::numeric_limits<double>::infinity();
  if (dtype == FDDataType::FP32) {
    CalculateStatisInfo<float>(CpuData(), Numel(), &mean, &max, &min);
  } else if (dtype == FDDataType::FP64) {
    CalculateStatisInfo<double>(CpuData(), Numel(), &mean, &max, &min);
  } else if (dtype == FDDataType::INT8) {
    CalculateStatisInfo<int8_t>(CpuData(), Numel(), &mean, &max, &min);
  } else if (dtype == FDDataType::UINT8) {
    CalculateStatisInfo<uint8_t>(CpuData(), Numel(), &mean, &max, &min);
  } else if (dtype == FDDataType::INT32) {
    CalculateStatisInfo<int32_t>(CpuData(), Numel(), &mean, &max, &min);
  } else if (dtype == FDDataType::INT64) {
    CalculateStatisInfo<int64_t>(CpuData(), Numel(), &mean, &max, &min);
  } else if (dtype == FDDataType::FP16) {
    CalculateStatisInfo<float16>(CpuData(), Numel(), &mean, &max, &min);
  } else {
    FDASSERT(false,
             "PrintInfo doesn't support the current dtype; this function "
             "needs to be extended to handle it.");
  }
  std::cout << prefix << ": name=" << name << ", shape=";
  for (size_t i = 0; i < shape.size(); ++i) {
    std::cout << shape[i] << " ";
  }
  std::cout << ", dtype=" << Str(dtype) << ", mean=" << mean << ", max=" << max
            << ", min=" << min << std::endl;
}

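// (Re)allocates the owned buffer to hold at least `nbytes`. The GPU and
// pinned-host paths only grow: a smaller request keeps the current
// allocation, and growing frees the old buffer without preserving its
// contents. The plain-CPU path uses realloc, which does preserve data.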
bool FDTensor::ReallocFn(size_t nbytes) {
  if (device == Device::GPU) {
#ifdef WITH_GPU
    size_t original_nbytes = Nbytes();
    if (nbytes > original_nbytes) {
      if (buffer_ != nullptr) {
        FDDeviceFree()(buffer_);
      }
      FDDeviceAllocator()(&buffer_, nbytes);
    }
    return buffer_ != nullptr;
#else
    FDASSERT(false, "The FastDeploy FDTensor allocator was not compiled "
                    "with -DWITH_GPU=ON, so GPU memory cannot be "
                    "allocated here.");
#endif
  } else {
    if (is_pinned_memory) {
#ifdef WITH_GPU
      size_t original_nbytes = Nbytes();
      if (nbytes > original_nbytes) {
        if (buffer_ != nullptr) {
          FDDeviceHostFree()(buffer_);
        }
        FDDeviceHostAllocator()(&buffer_, nbytes);
      }
      return buffer_ != nullptr;
#else
      FDASSERT(false, "The FastDeploy FDTensor allocator was not compiled "
                      "with -DWITH_GPU=ON, so pinned host memory cannot "
                      "be allocated here.");
#endif
    }
    // Use a temporary so the old buffer is not leaked if realloc fails.
    void* new_buffer = realloc(buffer_, nbytes);
    if (new_buffer == nullptr) {
      return false;
    }
    buffer_ = new_buffer;
    return true;
  }
}

void FDTensor::FreeFn() {
  external_data_ptr = nullptr;
  if (buffer_ != nullptr) {
    if (device == Device::GPU) {
#ifdef WITH_GPU
      FDDeviceFree()(buffer_);
#endif
    } else {
      if (is_pinned_memory) {
#ifdef WITH_GPU
        FDDeviceHostFree()(buffer_);
#endif
      } else {
        FDHostFree()(buffer_);
      }
    }
    buffer_ = nullptr;
  }
}

// TODO(liqi): CopyBuffer has no src_device / dst_device arguments yet.
// It should support copies between any combination of CPU and GPU.
void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes,
                          const Device& device, bool is_pinned_memory) {
  if (device == Device::GPU) {
#ifdef WITH_GPU
    FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) ==
                 cudaSuccess,
             "[ERROR] Error occurs while copying memory from GPU to GPU");
#else
    FDASSERT(false,
             "FastDeploy was not compiled with -DWITH_GPU=ON, so a GPU "
             "buffer cannot be copied here.");
#endif
  } else {
    if (is_pinned_memory) {
#ifdef WITH_GPU
      FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToHost) ==
                   cudaSuccess,
               "[ERROR] Error occurs while copying memory from host to host");
#else
      FDASSERT(false,
               "FastDeploy was not compiled with -DWITH_GPU=ON, so a "
               "pinned host buffer cannot be copied here.");
#endif
    } else {
      std::memcpy(dst, src, nbytes);
    }
  }
}

FDTensor::FDTensor(const std::string& tensor_name) { name = tensor_name; }
FDTensor::FDTensor(const char* tensor_name) { name = tensor_name; }

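// Builds a one-element tensor (shape {1}) whose dtype is taken from
// the Scalar. A hypothetical usage sketch, assuming Scalar is
// constructible from a float:
//   FDTensor t(Scalar(0.5f));  // shape {1}, dtype FP32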
FDTensor::FDTensor(const Scalar& scalar) {
  Allocate({1}, scalar.dtype());
  switch (scalar.dtype()) {
    case FDDataType::BOOL:
      (reinterpret_cast<bool*>(Data()))[0] = scalar.to<bool>();
      break;
    case FDDataType::UINT8:
      (reinterpret_cast<uint8_t*>(Data()))[0] = scalar.to<uint8_t>();
      break;
    case FDDataType::INT8:
      (reinterpret_cast<int8_t*>(Data()))[0] = scalar.to<int8_t>();
      break;
    case FDDataType::INT16:
      (reinterpret_cast<int16_t*>(Data()))[0] = scalar.to<int16_t>();
      break;
    case FDDataType::INT32:
      (reinterpret_cast<int*>(Data()))[0] = scalar.to<int>();
      break;
    case FDDataType::INT64:
      (reinterpret_cast<int64_t*>(Data()))[0] = scalar.to<int64_t>();
      break;
    case FDDataType::FP16:
      (reinterpret_cast<float16*>(Data()))[0] = scalar.to<float16>();
      break;
    case FDDataType::FP32:
      (reinterpret_cast<float*>(Data()))[0] = scalar.to<float>();
      break;
    case FDDataType::FP64:
      (reinterpret_cast<double*>(Data()))[0] = scalar.to<double>();
      break;
    default:
      break;
  }
}

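// Copy construction deep-copies an owned buffer, but for externally
// shared data only the pointer is copied, so both tensors alias the
// same external memory.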
FDTensor::FDTensor(const FDTensor& other)
    : shape(other.shape), name(other.name), dtype(other.dtype),
      device(other.device), external_data_ptr(other.external_data_ptr),
      device_id(other.device_id) {
  // Copy the owned buffer, if any.
  if (other.buffer_ == nullptr) {
    buffer_ = nullptr;
  } else {
    size_t nbytes = Nbytes();
    FDASSERT(ReallocFn(nbytes),
             "The FastDeploy FDTensor failed to allocate memory");
    CopyBuffer(buffer_, other.buffer_, nbytes, device, is_pinned_memory);
  }
}

FDTensor::FDTensor(FDTensor&& other)
    : buffer_(other.buffer_), shape(std::move(other.shape)),
      name(std::move(other.name)), dtype(other.dtype),
      external_data_ptr(other.external_data_ptr), device(other.device),
      device_id(other.device_id) {
  other.name = "";
  // Note(zhoushunjie): Avoid double free.
  other.buffer_ = nullptr;
  other.external_data_ptr = nullptr;
}

FDTensor& FDTensor::operator=(const FDTensor& other) {
  if (&other != this) {
    // Copy buffer
    device_id = other.device_id;
    if (other.buffer_ == nullptr) {
      FreeFn();
      buffer_ = nullptr;
      shape = other.shape;
      name = other.name;
      dtype = other.dtype;
      device = other.device;
    } else {
      Resize(other.shape, other.dtype, other.name, other.device);
      size_t nbytes = Nbytes();
      CopyBuffer(buffer_, other.buffer_, nbytes, device, is_pinned_memory);
    }
    external_data_ptr = other.external_data_ptr;
  }
  return *this;
}

FDTensor& FDTensor::operator=(FDTensor&& other) {
  if (&other != this) {
    FreeFn();
    buffer_ = other.buffer_;
    external_data_ptr = other.external_data_ptr;

    shape = std::move(other.shape);
    name = std::move(other.name);
    dtype = other.dtype;
    device = other.device;
    device_id = other.device_id;

    other.name = "";
    // Note(zhoushunjie): Avoid double free.
    other.buffer_ = nullptr;
    other.external_data_ptr = nullptr;
  }
  return *this;
}

}  // namespace fastdeploy