/* 3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) Written by Shaoshuai Shi All Rights Reserved 2019-2020. */ #include namespace fastdeploy { namespace paddle_custom_ops { // #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) __host__ __device__ static inline int DIVUP(const int m, const int n) { return ((m) / (n) + ((m) % (n) > 0)); } static const int THREADS_PER_BLOCK = 16; static const int THREADS_PER_BLOCK_NMS = sizeof(int64_t) * 8; #if defined(_WIN32) #if defined(EPS) #undef EPS #endif #define EPS 1e-8 #else static const float EPS = 1e-8; #endif struct Point { float x, y; __device__ Point() {} __device__ Point(double _x, double _y) { x = _x, y = _y; } __device__ void set(float _x, float _y) { x = _x; y = _y; } __device__ Point operator+(const Point &b) const { return Point(x + b.x, y + b.y); } __device__ Point operator-(const Point &b) const { return Point(x - b.x, y - b.y); } }; __device__ inline float cross(const Point &a, const Point &b) { return a.x * b.y - a.y * b.x; } __device__ inline float cross(const Point &p1, const Point &p2, const Point &p0) { return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); } __device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2) { int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && min(q1.x, q2.x) <= max(p1.x, p2.x) && min(p1.y, p2.y) <= max(q1.y, q2.y) && min(q1.y, q2.y) <= max(p1.y, p2.y); return ret; } __device__ inline int check_in_box2d(const float *box, const Point &p) { // params: (7) [x, y, z, dx, dy, dz, heading] const float MARGIN = 1e-2; // Align with the setting of mmdet3d // const float MARGIN = 1e-5; float center_x = box[0], center_y = box[1]; float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); // rotate the point in the opposite direction of box float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN); // Align with the implement of mmdet3d // float rot_x = // (p.x - center_x) * angle_cos + (p.y - center_y) * angle_sin + center_x; // float rot_y = // -(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + // center_y; // float x1 = center_x - box[3] / 2; // float x2 = center_x + box[3] / 2; // float y1 = center_y - box[4] / 2; // float y2 = center_y + box[4] / 2; // return (rot_x > x1 - MARGIN && rot_x < x2 + MARGIN && rot_y > y1 - MARGIN // && // rot_y < y2 + MARGIN); } __device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans) { // fast exclusion if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; // check cross standing float s1 = cross(q0, p1, p0); float s2 = cross(p1, q1, p0); float s3 = cross(p0, q1, q0); float s4 = cross(q1, p1, q0); if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; // calculate intersection of two lines float s5 = cross(q1, p1, p0); if (fabs(s5 - s1) > EPS) { ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); } else { float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; float D = a0 * b1 - a1 * b0; ans.x = (b0 * c1 - b1 * c0) / D; ans.y = (a1 * c0 - a0 * c1) / D; } return 1; } __device__ inline void rotate_around_center(const Point ¢er, const float angle_cos, const float angle_sin, Point &p) { // float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * // (-angle_sin) + center.x; // float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + // center.y; // p.set(new_x, new_y); // Aligh with the implement of mmdet3d float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * angle_sin + center.x; float new_y = -(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; p.set(new_x, new_y); } __device__ inline int point_cmp(const Point &a, const Point &b, const Point ¢er) { return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x); } __device__ inline float box_overlap(const float *box_a, const float *box_b) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] float a_angle = box_a[6], b_angle = box_b[6]; float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; Point center_a(box_a[0], box_a[1]); Point center_b(box_b[0], box_b[1]); #ifdef DEBUG printf( "a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle, b_x1, b_y1, b_x2, b_y2, b_angle); printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y); #endif Point box_a_corners[5]; box_a_corners[0].set(a_x1, a_y1); box_a_corners[1].set(a_x2, a_y1); box_a_corners[2].set(a_x2, a_y2); box_a_corners[3].set(a_x1, a_y2); Point box_b_corners[5]; box_b_corners[0].set(b_x1, b_y1); box_b_corners[1].set(b_x2, b_y1); box_b_corners[2].set(b_x2, b_y2); box_b_corners[3].set(b_x1, b_y2); // get oriented corners float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); for (int k = 0; k < 4; k++) { #ifdef DEBUG printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); #endif rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); #ifdef DEBUG printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); #endif } box_a_corners[4] = box_a_corners[0]; box_b_corners[4] = box_b_corners[0]; // get intersection of lines Point cross_points[16]; Point poly_center; int cnt = 0, flag = 0; poly_center.set(0, 0); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]); if (flag) { poly_center = poly_center + cross_points[cnt]; cnt++; #ifdef DEBUG printf( "Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, " "%.3f)->(%.3f, %.3f) \n", cross_points[cnt - 1].x, cross_points[cnt - 1].y, box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y, box_b_corners[i].x, box_b_corners[i].y, box_b_corners[i + 1].x, box_b_corners[i + 1].y); #endif } } } // check corners for (int k = 0; k < 4; k++) { if (check_in_box2d(box_a, box_b_corners[k])) { poly_center = poly_center + box_b_corners[k]; cross_points[cnt] = box_b_corners[k]; cnt++; #ifdef DEBUG printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y); #endif } if (check_in_box2d(box_b, box_a_corners[k])) { poly_center = poly_center + box_a_corners[k]; cross_points[cnt] = box_a_corners[k]; cnt++; #ifdef DEBUG printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y); #endif } } poly_center.x /= cnt; poly_center.y /= cnt; // sort the points of polygon Point temp; for (int j = 0; j < cnt - 1; j++) { for (int i = 0; i < cnt - j - 1; i++) { if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { temp = cross_points[i]; cross_points[i] = cross_points[i + 1]; cross_points[i + 1] = temp; } } } #ifdef DEBUG printf("cnt=%d\n", cnt); for (int i = 0; i < cnt; i++) { printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y); } #endif // get the overlap areas float area = 0; for (int k = 0; k < cnt - 1; k++) { area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); } return fabs(area) / 2.0; } __device__ inline float iou_bev(const float *box_a, const float *box_b) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] float sa = box_a[3] * box_a[4]; float sb = box_b[3] * box_b[4]; float s_overlap = box_overlap(box_a, box_b); return s_overlap / fmaxf(sa + sb - s_overlap, EPS); } __global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap) { // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; if (a_idx >= num_a || b_idx >= num_b) { return; } const float *cur_box_a = boxes_a + a_idx * 7; const float *cur_box_b = boxes_b + b_idx * 7; float s_overlap = box_overlap(cur_box_a, cur_box_b); ans_overlap[a_idx * num_b + b_idx] = s_overlap; } __global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou) { // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; if (a_idx >= num_a || b_idx >= num_b) { return; } const float *cur_box_a = boxes_a + a_idx * 7; const float *cur_box_b = boxes_b + b_idx * 7; float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; } __global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh, const float *boxes, unsigned long long *mask) { // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; const float *cur_box = boxes + cur_box_idx * 7; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { t |= 1ULL << i; } } int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); mask[cur_box_idx * col_blocks + col_start] = t; } } __device__ inline float iou_normal(float const *const a, float const *const b) { // params: a: [x, y, z, dx, dy, dz, heading] // params: b: [x, y, z, dx, dy, dz, heading] float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2); float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); float interS = width * height; float Sa = a[3] * a[4]; float Sb = b[3] * b[4]; return interS / fmaxf(Sa + Sb - interS, EPS); } __global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh, const float *boxes, unsigned long long *mask) { // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; const float *cur_box = boxes + cur_box_idx * 7; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { t |= 1ULL << i; } } int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); mask[cur_box_idx * col_blocks + col_start] = t; } } void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap) { dim3 blocks( DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); boxes_overlap_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_overlap); #ifdef DEBUG cudaDeviceSynchronize(); // for using printf in kernel function #endif } void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou) { dim3 blocks( DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); boxes_iou_bev_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_iou); #ifdef DEBUG cudaDeviceSynchronize(); // for using printf in kernel function #endif } void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num, float nms_overlap_thresh) { dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); dim3 threads(THREADS_PER_BLOCK_NMS); nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); } void nmsNormalLauncher(const float *boxes, unsigned long long *mask, int boxes_num, float nms_overlap_thresh) { dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); dim3 threads(THREADS_PER_BLOCK_NMS); nms_normal_kernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); } __global__ void nms_kernel_centerpoint(const int num_bboxes, const int num_bboxes_for_nms, const float nms_overlap_thresh, const int decode_bboxes_dims, const float *bboxes, const int *index, const int64_t *sorted_index, int64_t *mask) { // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = fminf(num_bboxes_for_nms - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); const int col_size = fminf(num_bboxes_for_nms - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; if (threadIdx.x < col_size) { int box_idx = index[sorted_index[THREADS_PER_BLOCK_NMS * col_start + threadIdx.x]]; block_boxes[threadIdx.x * 7 + 0] = bboxes[box_idx * decode_bboxes_dims]; block_boxes[threadIdx.x * 7 + 1] = bboxes[box_idx * decode_bboxes_dims + 1]; block_boxes[threadIdx.x * 7 + 2] = bboxes[box_idx * decode_bboxes_dims + 2]; block_boxes[threadIdx.x * 7 + 3] = bboxes[box_idx * decode_bboxes_dims + 4]; block_boxes[threadIdx.x * 7 + 4] = bboxes[box_idx * decode_bboxes_dims + 3]; block_boxes[threadIdx.x * 7 + 5] = bboxes[box_idx * decode_bboxes_dims + 5]; block_boxes[threadIdx.x * 7 + 6] = -bboxes[box_idx * decode_bboxes_dims + decode_bboxes_dims - 1] - 3.141592653589793 / 2; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; const int act_box_idx = index[sorted_index[cur_box_idx]]; float cur_box[7]; cur_box[0] = bboxes[act_box_idx * decode_bboxes_dims]; cur_box[1] = bboxes[act_box_idx * decode_bboxes_dims + 1]; cur_box[2] = bboxes[act_box_idx * decode_bboxes_dims + 2]; cur_box[3] = bboxes[act_box_idx * decode_bboxes_dims + 4]; cur_box[4] = bboxes[act_box_idx * decode_bboxes_dims + 3]; cur_box[5] = bboxes[act_box_idx * decode_bboxes_dims + 5]; cur_box[6] = -bboxes[act_box_idx * decode_bboxes_dims + decode_bboxes_dims - 1] - 3.141592653589793 / 2; int i = 0; int64_t t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { t |= 1ULL << i; } } int col_blocks = DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS); mask[cur_box_idx * col_blocks + col_start] = t; } } void NmsLauncher(const cudaStream_t &stream, const float *bboxes, const int *index, const int64_t *sorted_index, const int num_bboxes, const int num_bboxes_for_nms, const float nms_overlap_thresh, const int decode_bboxes_dims, int64_t *mask) { dim3 blocks(DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS), DIVUP(num_bboxes_for_nms, THREADS_PER_BLOCK_NMS)); dim3 threads(THREADS_PER_BLOCK_NMS); nms_kernel_centerpoint<<>>( num_bboxes, num_bboxes_for_nms, nms_overlap_thresh, decode_bboxes_dims, bboxes, index, sorted_index, mask); } } // namespace fastdeploy } // namespace paddle_custom_ops