Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-31 03:46:40 +08:00)

Commit 5b54bcfb48:
* [NEON] fix visseg background error with bit ops
* [Bug Fix] fix build_android_aar.sh ci scripts
* [Bug Fix] fix VisSegmentationCommonCpu

C++ source file, 235 lines, 9.5 KiB

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef ENABLE_VISION_VISUALIZE

#include "fastdeploy/vision/visualize/visualize.h"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

namespace fastdeploy {
namespace vision {

#ifdef __ARM_NEON
static constexpr int VIS_SEG_OMP_NUM_THREADS = 2;

static inline void QuantizeBlendingWeight8(
  float weight, uint8_t* old_multi_factor, uint8_t* new_multi_factor) {
  // Quantize the weight to boost blending performance.
  // if 0.0 < w <= 1/8, w ~ 1/8 = 1/(2^3): shift right 3, mul factors (new 1, old 7)
  // if 1/8 < w <= 2/8, w ~ 2/8 = 2/(2^3): shift right 3, mul factors (new 2, old 6)
  // if 2/8 < w <= 3/8, w ~ 3/8 = 3/(2^3): shift right 3, mul factors (new 3, old 5)
  // if 3/8 < w <= 4/8, w ~ 4/8 = 4/(2^3): shift right 3, mul factors (new 4, old 4)
  // The shift factor is always 3, but the mul factor differs.
  // Shifting 7 bits to the right tends to produce zero values,
  // so we shift by 3 bits to get an approximation.
  uint8_t weight_quantize = static_cast<uint8_t>(weight * 8.0f);
  *new_multi_factor = weight_quantize;
  *old_multi_factor = (8 - weight_quantize);
}
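
// Worked example (illustrative numbers, not from the original source): for
// weight = 0.4, weight_quantize = static_cast<uint8_t>(0.4f * 8.0f) = 3, so
// new_multi_factor = 3 and old_multi_factor = 5. Each blended channel in the
// quantized path below is then roughly (im_pixel >> 3) * 5 + (mask_pixel >> 3) * 3,
// i.e. about 5/8 of the original color plus 3/8 of the mask color.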

static cv::Mat FastVisSegmentationNEON(
  const cv::Mat& im, const SegmentationResult& result,
  float weight, bool quantize_weight = true) {
  int64_t height = result.shape[0];
  int64_t width = result.shape[1];
  auto vis_img = cv::Mat(height, width, CV_8UC3);

  int32_t size = static_cast<int32_t>(height * width);
  uint8_t *vis_ptr = static_cast<uint8_t*>(vis_img.data);
  const uint8_t *label_ptr = static_cast<const uint8_t*>(result.label_map.data());
  const uint8_t *im_ptr = static_cast<const uint8_t*>(im.data);

  if (!quantize_weight) {
    uint8x16_t zerox16 = vdupq_n_u8(0);
    #pragma omp parallel for proc_bind(close) \
    num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static)
    for (int i = 0; i < size - 15; i += 16) {
      uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3);  // 48 bytes
      uint8x16_t labelx16 = vld1q_u8(label_ptr + i);  // 16 bytes
      uint8x16_t ibx16 = bgrx16x3.val[0];
      uint8x16_t igx16 = bgrx16x3.val[1];
      uint8x16_t irx16 = bgrx16x3.val[2];
      // e.g. 0b00000001 << 7 -> 0b10000000 (128)
      uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7);
      uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4);
      uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3);
      uint8x16x3_t vbgrx16x3;
      // Keep the pixels of the input im where mask == 0
      uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16);
      vbgrx16x3.val[0] = vorrq_u8(vandq_u8(cezx16, ibx16), mbx16);
      vbgrx16x3.val[1] = vorrq_u8(vandq_u8(cezx16, igx16), mgx16);
      vbgrx16x3.val[2] = vorrq_u8(vandq_u8(cezx16, irx16), mrx16);
      vst3q_u8(vis_ptr + i * 3, vbgrx16x3);
    }
    for (int i = size - 15; i < size; i++) {
      uint8_t label = label_ptr[i];
      vis_ptr[i * 3 + 0] = (label << 7);
      vis_ptr[i * 3 + 1] = (label << 4);
      vis_ptr[i * 3 + 2] = (label << 3);
    }
    // Blend the colors using OpenCV
    cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img);
    return vis_img;
  }

  // Quantize the weight to boost blending performance.
  // After that, we can directly use shift instructions
  // to blend the colors from the input im and the mask. Please
  // check QuantizeBlendingWeight8 for more details.
  uint8_t old_multi_factor, new_multi_factor;
  QuantizeBlendingWeight8(weight, &old_multi_factor,
                          &new_multi_factor);
  if (new_multi_factor == 0) {
    return im;  // Only keep the original image.
  }

  if (new_multi_factor == 8) {
    // Only keep the mask, no need to blend with the original image.
    #pragma omp parallel for proc_bind(close) \
    num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static)
    for (int i = 0; i < size - 15; i += 16) {
      uint8x16_t labelx16 = vld1q_u8(label_ptr + i);  // 16 bytes
      // e.g. 0b00000001 << 7 -> 0b10000000 (128)
      uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7);
      uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4);
      uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3);
      uint8x16x3_t vbgr16x3;
      vbgr16x3.val[0] = mbx16;
      vbgr16x3.val[1] = mgx16;
      vbgr16x3.val[2] = mrx16;
      vst3q_u8(vis_ptr + i * 3, vbgr16x3);
    }
    for (int i = size - 15; i < size; i++) {
      uint8_t label = label_ptr[i];
      vis_ptr[i * 3 + 0] = (label << 7);
      vis_ptr[i * 3 + 1] = (label << 4);
      vis_ptr[i * 3 + 2] = (label << 3);
    }
    return vis_img;
  }

  uint8x16_t zerox16 = vdupq_n_u8(0);
  uint8x16_t old_fx16 = vdupq_n_u8(old_multi_factor);
  uint8x16_t new_fx16 = vdupq_n_u8(new_multi_factor);
  // Blend the two colors together with the quantized 'weight'.
  #pragma omp parallel for proc_bind(close) \
  num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static)
  for (int i = 0; i < size - 15; i += 16) {
    uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3);  // 48 bytes
    uint8x16_t labelx16 = vld1q_u8(label_ptr + i);  // 16 bytes
    uint8x16_t ibx16 = bgrx16x3.val[0];
    uint8x16_t igx16 = bgrx16x3.val[1];
    uint8x16_t irx16 = bgrx16x3.val[2];
    // e.g. 0b00000001 << 7 -> 0b10000000 (128)
    uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7);
    uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4);
    uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3);
    // Shifting 7 bits to the right tends to produce zero values,
    // so we shift by 3 bits to get an approximation.
    uint8x16_t ibx16_mshr = vmulq_u8(vshrq_n_u8(ibx16, 3), old_fx16);
    uint8x16_t igx16_mshr = vmulq_u8(vshrq_n_u8(igx16, 3), old_fx16);
    uint8x16_t irx16_mshr = vmulq_u8(vshrq_n_u8(irx16, 3), old_fx16);
    uint8x16_t mbx16_mshr = vmulq_u8(vshrq_n_u8(mbx16, 3), new_fx16);
    uint8x16_t mgx16_mshr = vmulq_u8(vshrq_n_u8(mgx16, 3), new_fx16);
    uint8x16_t mrx16_mshr = vmulq_u8(vshrq_n_u8(mrx16, 3), new_fx16);
    uint8x16_t qbx16 = vqaddq_u8(ibx16_mshr, mbx16_mshr);
    uint8x16_t qgx16 = vqaddq_u8(igx16_mshr, mgx16_mshr);
    uint8x16_t qrx16 = vqaddq_u8(irx16_mshr, mrx16_mshr);
    // Keep the pixels of the input im where label == 0 (i.e. mask == 0)
    uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16);
    uint8x16_t abx16 = vandq_u8(cezx16, ibx16);
    uint8x16_t agx16 = vandq_u8(cezx16, igx16);
    uint8x16_t arx16 = vandq_u8(cezx16, irx16);
    uint8x16x3_t vbgr16x3;
    // Reset the blended qx values to 0 where label is 0, and keep the
    // mask-blended values where label is not 0
    uint8x16_t ncezx16 = vmvnq_u8(cezx16);
    vbgr16x3.val[0] = vorrq_u8(abx16, vandq_u8(ncezx16, qbx16));
    vbgr16x3.val[1] = vorrq_u8(agx16, vandq_u8(ncezx16, qgx16));
    vbgr16x3.val[2] = vorrq_u8(arx16, vandq_u8(ncezx16, qrx16));
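    // Note: the and/or pair above is a bitwise select. It could equivalently
    // be written with the NEON select intrinsic, e.g.
    //   vbgr16x3.val[0] = vbslq_u8(cezx16, ibx16, qbx16);
    // (and likewise for the green and red lanes), which keeps the original im
    // pixels where label == 0 and the blended values elsewhere.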
    // Store the blended pixels to the vis img
    vst3q_u8(vis_ptr + i * 3, vbgr16x3);
  }
  for (int i = size - 15; i < size; i++) {
    uint8_t label = label_ptr[i];
    vis_ptr[i * 3 + 0] = (im_ptr[i * 3 + 0] >> 3) * old_multi_factor
      + ((label << 7) >> 3) * new_multi_factor;
    vis_ptr[i * 3 + 1] = (im_ptr[i * 3 + 1] >> 3) * old_multi_factor
      + ((label << 4) >> 3) * new_multi_factor;
    vis_ptr[i * 3 + 2] = (im_ptr[i * 3 + 2] >> 3) * old_multi_factor
      + ((label << 3) >> 3) * new_multi_factor;
  }
  return vis_img;
}
#endif

static cv::Mat VisSegmentationCommonCpu(
  const cv::Mat& im, const SegmentationResult& result,
  float weight) {
  // Use the native C++ version without any optimization.
  auto color_map = GenerateColorMap(1000);
  int64_t height = result.shape[0];
  int64_t width = result.shape[1];
  auto vis_img = cv::Mat(height, width, CV_8UC3);

  int64_t index = 0;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      int category_id = result.label_map[index++];
      if (category_id == 0) {
        vis_img.at<cv::Vec3b>(i, j)[0] = im.at<cv::Vec3b>(i, j)[0];
        vis_img.at<cv::Vec3b>(i, j)[1] = im.at<cv::Vec3b>(i, j)[1];
        vis_img.at<cv::Vec3b>(i, j)[2] = im.at<cv::Vec3b>(i, j)[2];
      } else {
        vis_img.at<cv::Vec3b>(i, j)[0] = color_map[3 * category_id + 0];
        vis_img.at<cv::Vec3b>(i, j)[1] = color_map[3 * category_id + 1];
        vis_img.at<cv::Vec3b>(i, j)[2] = color_map[3 * category_id + 2];
      }
    }
  }
  cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img);
  return vis_img;
}

cv::Mat VisSegmentation(const cv::Mat& im, const SegmentationResult& result,
                        float weight) {
  // TODO: Support SSE/AVX on x86_64 platforms
#ifdef __ARM_NEON
  return FastVisSegmentationNEON(im, result, weight, true);
#else
  return VisSegmentationCommonCpu(im, result, weight);
#endif
}
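
// Usage sketch (illustrative only; the model object and file names below are
// assumptions, not part of this file). A segmentation model fills a
// SegmentationResult, which is then rendered with VisSegmentation above:
//
//   cv::Mat im = cv::imread("test.jpg");
//   fastdeploy::vision::SegmentationResult result;
//   seg_model.Predict(&im, &result);  // 'seg_model' is a hypothetical, already-loaded model
//   cv::Mat vis = fastdeploy::vision::VisSegmentation(im, result, 0.5f);
//   cv::imwrite("vis_result.jpg", vis);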

cv::Mat Visualize::VisSegmentation(const cv::Mat& im,
                                   const SegmentationResult& result) {
  FDWARNING << "DEPRECATED: fastdeploy::vision::Visualize::VisSegmentation is "
               "deprecated, please use fastdeploy::vision::VisSegmentation "
               "function instead."
            << std::endl;
#ifdef __ARM_NEON
  return FastVisSegmentationNEON(im, result, 0.5f, true);
#else
  return VisSegmentationCommonCpu(im, result, 0.5f);
#endif
}

}  // namespace vision
}  // namespace fastdeploy
#endif  // ENABLE_VISION_VISUALIZE