diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2675b1f97..d2843d8ef 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -480,10 +480,6 @@ else()
   set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS_RELEASE -s)
 endif()
 
-#find_package(OpenMP)
-#if(OpenMP_CXX_FOUND)
-#  list(APPEND DEPEND_LIBS OpenMP::OpenMP_CXX)
-#endif()
 set_target_properties(${LIBRARY_NAME} PROPERTIES VERSION ${FASTDEPLOY_VERSION})
 if(MSVC)
   # disable warnings for dll export
@@ -493,6 +489,10 @@ endif()
 if (ANDROID)
   find_library(log-lib log)
   list(APPEND DEPEND_LIBS ${log-lib})
+  find_package(OpenMP)
+  if(OpenMP_CXX_FOUND)
+    list(APPEND DEPEND_LIBS OpenMP::OpenMP_CXX)
+  endif()
 endif()
 
 target_link_libraries(${LIBRARY_NAME} ${DEPEND_LIBS})
diff --git a/fastdeploy/vision/visualize/segmentation.cc b/fastdeploy/vision/visualize/segmentation.cc
index 8ac1f1ac8..43f857250 100644
--- a/fastdeploy/vision/visualize/segmentation.cc
+++ b/fastdeploy/vision/visualize/segmentation.cc
@@ -17,12 +17,173 @@
 #include "fastdeploy/vision/visualize/visualize.h"
 #include "opencv2/highgui.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
 
 namespace fastdeploy {
 namespace vision {
 
-cv::Mat VisSegmentation(const cv::Mat& im, const SegmentationResult& result,
-                        float weight) {
+#ifdef __ARM_NEON
+// Defined further down; the NEON path falls back to it for inputs whose
+// pixels cannot be walked safely through raw pointers.
+static cv::Mat VisSegmentationCommonCpu(
+    const cv::Mat& im, const SegmentationResult& result,
+    float weight);
+
+// Splits a blending weight into two integer factors expressed in eighths:
+// new_multi_factor = floor(weight * 8) scales the mask and
+// old_multi_factor = 8 - new_multi_factor scales the input image.
+static inline void QuantizeBlendingWeight8(
+    float weight, uint8_t* old_multi_factor, uint8_t* new_multi_factor) {
+  // Quantize the weight to boost blending performance.
+  // if 0/8 <= w < 1/8, w ~ 0/8 shift right 3 mul 0, 8
+  // if 1/8 <= w < 2/8, w ~ 1/8 shift right 3 mul 1, 7
+  // if 2/8 <= w < 3/8, w ~ 2/8 shift right 3 mul 2, 6
+  // if 3/8 <= w < 4/8, w ~ 3/8 shift right 3 mul 3, 5
+  // Shift factor is always 3, but the mul factor is different.
+  // Moving 7 bits to the right tends to result in a zero value,
+  // So, We choose to shift 3 bits to get an approximation.
+  // Clamp to [0, 1] first: a weight above 1 would make 8 - weight_quantize
+  // wrap around, and converting a negative or NaN float to uint8_t is
+  // undefined behaviour. NaN is treated as 0.
+  const float clamped =
+      !(weight > 0.0f) ? 0.0f : (weight > 1.0f ? 1.0f : weight);
+  uint8_t weight_quantize = static_cast<uint8_t>(clamped * 8.0f);
+  *new_multi_factor = weight_quantize;
+  *old_multi_factor = (8 - weight_quantize);
+}
+
+// Writes the BGR mask colour of each label into vis_ptr, 3 bytes per pixel:
+// B = label << 7, G = label << 4, R = label << 3, each truncated to 8 bits.
+static void FillMaskColorsNEON(const uint8_t* label_ptr, uint8_t* vis_ptr,
+                               int32_t size) {
+  // Pixels [0, simd_end) are handled 16 at a time, the rest one by one.
+  const int32_t simd_end = size - (size % 16);
+  #pragma omp parallel for num_threads(2) schedule(static)
+  for (int i = 0; i < simd_end; i += 16) {
+    uint8x16_t labelx16 = vld1q_u8(label_ptr + i);  // 16 bytes
+    // e.g 0b00000001 << 7 -> 0b10000000 128;
+    uint8x16x3_t vbgrx16x3;
+    vbgrx16x3.val[0] = vshlq_n_u8(labelx16, 7);
+    vbgrx16x3.val[1] = vshlq_n_u8(labelx16, 4);
+    vbgrx16x3.val[2] = vshlq_n_u8(labelx16, 3);
+    vst3q_u8(vis_ptr + i * 3, vbgrx16x3);
+  }
+  for (int i = simd_end; i < size; i++) {
+    uint8_t label = label_ptr[i];
+    vis_ptr[i * 3 + 0] = static_cast<uint8_t>(label << 7);
+    vis_ptr[i * 3 + 1] = static_cast<uint8_t>(label << 4);
+    vis_ptr[i * 3 + 2] = static_cast<uint8_t>(label << 3);
+  }
+}
+
+// NEON implementation of the segmentation overlay. Mask colours are derived
+// from the label bits (see FillMaskColorsNEON) rather than from a colour map.
+// With quantize_weight the blend is done in eighths using integer shifts and
+// multiplies; otherwise cv::addWeighted does the blending.
+static cv::Mat FastVisSegmentationNEON(
+    const cv::Mat& im, const SegmentationResult& result,
+    float weight, bool quantize_weight = true) {
+  int64_t height = result.shape[0];
+  int64_t width = result.shape[1];
+  if (im.type() != CV_8UC3 || im.rows != height || im.cols != width) {
+    // The kernels below read height * width BGR pixels straight from
+    // im.data; any other layout would be read out of bounds.
+    return VisSegmentationCommonCpu(im, result, weight);
+  }
+  // Raw pointer access needs the rows packed back to back.
+  const cv::Mat im_c = im.isContinuous() ? im : im.clone();
+  auto vis_img = cv::Mat(height, width, CV_8UC3);
+
+  int32_t size = static_cast<int32_t>(height * width);
+  uint8_t *vis_ptr = static_cast<uint8_t*>(vis_img.data);
+  const uint8_t *label_ptr =
+      static_cast<const uint8_t*>(result.label_map.data());
+  const uint8_t *im_ptr = static_cast<const uint8_t*>(im_c.data);
+
+  if (!quantize_weight) {
+    FillMaskColorsNEON(label_ptr, vis_ptr, size);
+    // Blend colors use opencv
+    cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img);
+    return vis_img;
+  }
+
+  // Quantize the weight to boost blending performance.
+  // After that, we can directly use shift instructions
+  // to blend the colors from input im and mask. Please
+  // check QuantizeBlendingWeight8 for more details.
+  uint8_t old_multi_factor, new_multi_factor;
+  QuantizeBlendingWeight8(weight, &old_multi_factor,
+                          &new_multi_factor);
+  if (new_multi_factor == 0) {
+    // Only keep origin image. Deep copy so the result never shares its
+    // buffer with the caller's input, like the other NEON return paths.
+    return im.clone();
+  }
+
+  if (new_multi_factor == 8) {
+    // Only keep mask, no need to blending with origin image.
+    FillMaskColorsNEON(label_ptr, vis_ptr, size);
+    return vis_img;
+  }
+
+  uint8x16_t old_mulx16 = vdupq_n_u8(old_multi_factor);
+  uint8x16_t new_mulx16 = vdupq_n_u8(new_multi_factor);
+  // Pixels [0, simd_end) are handled 16 at a time, the rest one by one.
+  const int32_t simd_end = size - (size % 16);
+  // Blend the two colors together with quantize 'weight'.
+  #pragma omp parallel for num_threads(2) schedule(static)
+  for (int i = 0; i < simd_end; i += 16) {
+    uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3);  // 48 bytes
+    uint8x16_t labelx16 = vld1q_u8(label_ptr + i);  // 16 bytes
+    uint8x16_t ibx16 = bgrx16x3.val[0];
+    uint8x16_t igx16 = bgrx16x3.val[1];
+    uint8x16_t irx16 = bgrx16x3.val[2];
+    // e.g 0b00000001 << 7 -> 0b10000000 128;
+    uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7);
+    uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4);
+    uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3);
+    // TODO: keep the pixels of input im if mask = 0
+    uint8x16_t ibx16_mshr, igx16_mshr, irx16_mshr;
+    uint8x16_t mbx16_mshr, mgx16_mshr, mrx16_mshr;
+    // Moving 7 bits to the right tends to result in zero,
+    // So, We choose to shift 3 bits to get an approximation
+    ibx16_mshr = vmulq_u8(vshrq_n_u8(ibx16, 3), old_mulx16);
+    igx16_mshr = vmulq_u8(vshrq_n_u8(igx16, 3), old_mulx16);
+    irx16_mshr = vmulq_u8(vshrq_n_u8(irx16, 3), old_mulx16);
+    mbx16_mshr = vmulq_u8(vshrq_n_u8(mbx16, 3), new_mulx16);
+    mgx16_mshr = vmulq_u8(vshrq_n_u8(mgx16, 3), new_mulx16);
+    mrx16_mshr = vmulq_u8(vshrq_n_u8(mrx16, 3), new_mulx16);
+    uint8x16x3_t vbgr16x3;
+    vbgr16x3.val[0] = vaddq_u8(ibx16_mshr, mbx16_mshr);
+    vbgr16x3.val[1] = vaddq_u8(igx16_mshr, mgx16_mshr);
+    vbgr16x3.val[2] = vaddq_u8(irx16_mshr, mrx16_mshr);
+    // Store the blended pixels to vis img
+    vst3q_u8(vis_ptr + i * 3, vbgr16x3);
+  }
+  for (int i = simd_end; i < size; i++) {
+    // Truncate the shifted label to 8 bits before shifting back so that the
+    // tail matches vshlq_n_u8, which drops bits shifted out of the lane.
+    const uint8_t label = label_ptr[i];
+    const uint8_t mb = static_cast<uint8_t>(label << 7);
+    const uint8_t mg = static_cast<uint8_t>(label << 4);
+    const uint8_t mr = static_cast<uint8_t>(label << 3);
+    vis_ptr[i * 3 + 0] = (im_ptr[i * 3 + 0] >> 3) * old_multi_factor
+                       + (mb >> 3) * new_multi_factor;
+    vis_ptr[i * 3 + 1] = (im_ptr[i * 3 + 1] >> 3) * old_multi_factor
+                       + (mg >> 3) * new_multi_factor;
+    vis_ptr[i * 3 + 2] = (im_ptr[i * 3 + 2] >> 3) * old_multi_factor
+                       + (mr >> 3) * new_multi_factor;
+  }
+  return vis_img;
+}
+#endif
+
+static cv::Mat VisSegmentationCommonCpu(
+    const cv::Mat& im, const SegmentationResult& result,
+    float weight) {
+  // Use the native c++ version without any optimization.
   auto color_map = GenerateColorMap(1000);
   int64_t height = result.shape[0];
   int64_t width = result.shape[1];
@@ -41,28 +202,27 @@ cv::Mat VisSegmentation(const cv::Mat& im, const SegmentationResult& result,
   return vis_img;
 }
 
+cv::Mat VisSegmentation(const cv::Mat& im, const SegmentationResult& result,
+                        float weight) {
+  // TODO: Support SSE/AVX on x86_64 platforms
+#ifdef __ARM_NEON
+  return FastVisSegmentationNEON(im, result, weight, true);
+#else
+  return VisSegmentationCommonCpu(im, result, weight);
+#endif
+}
+
 cv::Mat Visualize::VisSegmentation(const cv::Mat& im,
                                    const SegmentationResult& result) {
   FDWARNING << "DEPRECATED: fastdeploy::vision::Visualize::VisSegmentation is "
                "deprecated, please use fastdeploy::vision:VisSegmentation "
                "function instead."
-            << std::endl;
-  auto color_map = GetColorMap();
-  int64_t height = result.shape[0];
-  int64_t width = result.shape[1];
-  auto vis_img = cv::Mat(height, width, CV_8UC3);
-
-  int64_t index = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      int category_id = result.label_map[index++];
-      vis_img.at<cv::Vec3b>(i, j)[0] = color_map[3 * category_id + 0];
-      vis_img.at<cv::Vec3b>(i, j)[1] = color_map[3 * category_id + 1];
-      vis_img.at<cv::Vec3b>(i, j)[2] = color_map[3 * category_id + 2];
-    }
-  }
-  cv::addWeighted(im, .5, vis_img, .5, 0, vis_img);
-  return vis_img;
+            << std::endl;
+#ifdef __ARM_NEON
+  return FastVisSegmentationNEON(im, result, 0.5f, true);
+#else
+  return VisSegmentationCommonCpu(im, result, 0.5f);
+#endif
 }
 
 }  // namespace vision
diff --git a/fastdeploy/vision/visualize/visualize.cc b/fastdeploy/vision/visualize/visualize.cc
index f6b5a7b4b..bf0fdcb88 100644
--- a/fastdeploy/vision/visualize/visualize.cc
+++ b/fastdeploy/vision/visualize/visualize.cc
@@ -18,9 +18,6 @@
 namespace fastdeploy {
 namespace vision {
 
-int Visualize::num_classes_ = 0;
-std::vector<int> Visualize::color_map_ = std::vector<int>();
-
 static std::vector<int> global_fd_vis_color_map = std::vector<int>();
 
 std::vector<int> GenerateColorMap(int num_classes) {
@@ -42,6 +39,10 @@ std::vector<int> GenerateColorMap(int num_classes) {
   return color_map;
 }
 
+// This class is deprecated, please do not use it.
+int Visualize::num_classes_ = 0;
+std::vector<int> Visualize::color_map_ = std::vector<int>();
+
 const std::vector<int>& Visualize::GetColorMap(int num_classes) {
   if (num_classes < num_classes_) {
     return color_map_;