feat(face): add yolov5 face detecter

This commit is contained in:
Syd Xu
2021-10-29 19:05:58 +08:00
parent a61f9dc7b0
commit 50e43fc864
49 changed files with 1057 additions and 736 deletions

View File

@@ -21,6 +21,7 @@ cmake .. # optional -DNCNN_VULKAN=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COM
- mtcnn [Google Drive](https://drive.google.com/drive/folders/14ToHyDXZr4Ihuk8WYp1mVS7QnVxnzEjn?usp=sharing) - mtcnn [Google Drive](https://drive.google.com/drive/folders/14ToHyDXZr4Ihuk8WYp1mVS7QnVxnzEjn?usp=sharing)
- centerface [Google Drive](https://drive.google.com/drive/folders/1xMhO6aCnkkjt90Fh8BxVD_JHB3QJ2q-q?usp=sharing) - centerface [Google Drive](https://drive.google.com/drive/folders/1xMhO6aCnkkjt90Fh8BxVD_JHB3QJ2q-q?usp=sharing)
- retinaface [Google Drive](https://drive.google.com/drive/folders/1nxR3WFqqEWLwGVsp5c4tI0_iVVEaVOe8?usp=sharing) - retinaface [Google Drive](https://drive.google.com/drive/folders/1nxR3WFqqEWLwGVsp5c4tI0_iVVEaVOe8?usp=sharing)
- yoloface [Google Drive](https://drive.google.com/drive/folders/1EM9H6-aYXKsWTRxx_wbKDyYHVIYpU6f7?usp=sharing)
- anticonv (for mask detection) [Google Drive](https://drive.google.com/drive/folders/1Fje0fmVPy5g0_oaxUbH_cAedkgjBf7QW?usp=sharing) - anticonv (for mask detection) [Google Drive](https://drive.google.com/drive/folders/1Fje0fmVPy5g0_oaxUbH_cAedkgjBf7QW?usp=sharing)
- recognizer (face feature extraction for classification) - recognizer (face feature extraction for classification)
- mobilenet [Google Drive](https://drive.google.com/drive/folders/1fRLs10atm_vwDWQXZ-GJbKQpypNcXLAx?usp=sharing) - mobilenet [Google Drive](https://drive.google.com/drive/folders/1fRLs10atm_vwDWQXZ-GJbKQpypNcXLAx?usp=sharing)

61
go/common/keypoint.go Normal file
View File

@@ -0,0 +1,61 @@
package common
/*
#include <stdlib.h>
#include <stdbool.h>
#include "openvision/common/common.h"
*/
import "C"
import (
"unsafe"
)
// Keypoint represents a detected keypoint
type Keypoint struct {
// Point keypoint location
Point Point
// Score keypoint prob
Score float32
}
// GoKeypoint converts C.Keypoint to the Go type
func GoKeypoint(c *C.Keypoint, w float64, h float64) Keypoint {
return Keypoint{
Point: Pt(float64(c.p.x)/w, float64(c.p.y)/h),
Score: float32(c.prob),
}
}
// CKeypoint converts Keypoint to a *C.Keypoint
func (k Keypoint) CKeypoint(w float64, h float64) *C.Keypoint {
ret := (*C.Keypoint)(C.malloc(C.sizeof_Keypoint))
ret.prob = C.float(k.Score)
ret.p = C.Point2f{
C.float(k.Point.X * w),
C.float(k.Point.Y * h),
}
return ret
}
// NewCKeypointVector returns *C.KeypointVector
func NewCKeypointVector() *C.KeypointVector {
return (*C.KeypointVector)(C.malloc(C.sizeof_KeypointVector))
}
// FreeCKeypointVector releases *C.KeypointVector memory
func FreeCKeypointVector(points *C.KeypointVector) {
C.FreeKeypointVector(points)
C.free(unsafe.Pointer(points))
}
// GoKeypointVector converts *C.KeypointVector to a Keypoint slice
func GoKeypointVector(c *C.KeypointVector, w float64, h float64) []Keypoint {
l := int(c.length)
ret := make([]Keypoint, 0, l)
ptr := unsafe.Pointer(c.points)
for i := 0; i < l; i++ {
cKeypoint := (*C.Keypoint)(unsafe.Pointer(uintptr(ptr) + uintptr(C.sizeof_Keypoint*C.int(i))))
ret = append(ret, GoKeypoint(cKeypoint, w, h))
}
return ret
}

98
go/common/objectinfo.go Normal file
View File

@@ -0,0 +1,98 @@
package common
/*
#include <stdlib.h>
#include <stdbool.h>
#include "openvision/common/common.h"
*/
import "C"
import (
"unsafe"
)
// ObjectInfo represents detected object (ROI) info
type ObjectInfo struct {
// Score detected score
Score float32
// Label
Label int
// Rect roi location
Rect Rectangle
// Keypoints detected keypoints
Keypoints []Keypoint
}
// GoObjectInfo converts C.ObjectInfo to the Go type
func GoObjectInfo(c *C.ObjectInfo, w float64, h float64) ObjectInfo {
ret := ObjectInfo{
Label: int(c.label),
Score: float32(c.prob),
Rect: Rect(
float64(c.rect.x)/w,
float64(c.rect.y)/h,
float64(c.rect.width)/w,
float64(c.rect.height)/h,
),
}
if c.pts != nil {
ret.Keypoints = GoKeypointVector(c.pts, w, h)
}
return ret
}
// ToCObjectInfo returns ObjectInfo C type
func (o ObjectInfo) ToCObjectInfo(w float64, h float64) *C.ObjectInfo {
ret := (*C.ObjectInfo)(C.malloc(C.sizeof_ObjectInfo))
ret.label = C.int(o.Label)
ret.prob = C.float(o.Score)
ret.rect.x = C.int(o.Rect.X * w)
ret.rect.y = C.int(o.Rect.Y * h)
ret.rect.width = C.int(o.Rect.Width * w)
ret.rect.height = C.int(o.Rect.Height * h)
if len(o.Keypoints) > 0 {
ret.pts = (*C.KeypointVector)(C.malloc(C.sizeof_KeypointVector))
ret.pts.length = C.int(len(o.Keypoints))
// allocate space for all keypoints, not just one
ret.pts.points = (*C.Keypoint)(C.malloc(C.size_t(len(o.Keypoints)) * C.sizeof_Keypoint))
for idx, p := range o.Keypoints {
pt := C.Keypoint{
C.Point2f{C.float(p.Point.X * w), C.float(p.Point.Y * h)},
C.float(p.Score),
}
C.KeypointVectorSetValue(ret.pts, C.int(idx), &pt)
}
}
return ret
}
// NewCObjectInfoVector returns *C.ObjectInfoVector
func NewCObjectInfoVector() *C.ObjectInfoVector {
return (*C.ObjectInfoVector)(C.malloc(C.sizeof_ObjectInfoVector))
}
// FreeCObjectInfoVector releases *C.ObjectInfoVector memory
func FreeCObjectInfoVector(p *C.ObjectInfoVector) {
C.FreeObjectInfoVector(p)
C.free(unsafe.Pointer(p))
}
// GoObjectInfoVector converts *C.ObjectInfoVector to an ObjectInfo slice
func GoObjectInfoVector(c *C.ObjectInfoVector, w float64, h float64) []ObjectInfo {
l := int(c.length)
ret := make([]ObjectInfo, 0, l)
ptr := unsafe.Pointer(c.items)
for i := 0; i < l; i++ {
cVal := (*C.ObjectInfo)(unsafe.Pointer(uintptr(ptr) + uintptr(C.sizeof_ObjectInfo*C.int(i))))
ret = append(ret, GoObjectInfo(cVal, w, h))
}
return ret
}
// CObjectInfoVectiorLength returns the C.ObjectInfoVector length
func CObjectInfoVectiorLength(c *C.ObjectInfoVector) int {
return int(c.length)
}
// CObjectInfoVectorPtr returns the C.ObjectInfoVector start pointer
func CObjectInfoVectorPtr(c *C.ObjectInfoVector) unsafe.Pointer {
return unsafe.Pointer(c.items)
}
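Note: GoObjectInfo and GoKeypoint store coordinates normalized by the image width/height. A minimal sketch of mapping an ObjectInfo back to pixel units, assuming only the types above (the toPixels helper is hypothetical and not part of this package):

package example

import (
	"fmt"

	"github.com/bububa/openvision/go/common"
)

// toPixels converts a normalized ObjectInfo rect back to pixel units for an
// image of the given size (the inverse of the division done in GoObjectInfo).
func toPixels(obj common.ObjectInfo, imgW, imgH float64) (x, y, w, h float64) {
	return obj.Rect.X * imgW, obj.Rect.Y * imgH, obj.Rect.Width * imgW, obj.Rect.Height * imgH
}

func Example() {
	obj := common.ObjectInfo{
		Score: 0.9,
		Rect:  common.Rect(0.25, 0.25, 0.5, 0.5), // normalized to [0, 1]
	}
	x, y, w, h := toPixels(obj, 640, 480)
	fmt.Printf("box: %.0f,%.0f %.0fx%.0f\n", x, y, w, h) // box: 160,120 320x240
}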

View File

@@ -32,6 +32,7 @@ func test_detect(imgPath string, modelPath string) {
retinaface(modelPath), retinaface(modelPath),
centerface(modelPath), centerface(modelPath),
mtcnn(modelPath), mtcnn(modelPath),
yoloface(modelPath),
} { } {
detect(d, imgPath, idx, "4.jpg", false) detect(d, imgPath, idx, "4.jpg", false)
d.Destroy() d.Destroy()
@@ -62,6 +63,15 @@ func mtcnn(modelPath string) detecter.Detecter {
return d return d
} }
func yoloface(modelPath string) detecter.Detecter {
modelPath = filepath.Join(modelPath, "yoloface/v505")
d := detecter.NewYoloFace()
if err := d.LoadModel(modelPath); err != nil {
log.Fatalln(err)
}
return d
}
func centerface(modelPath string) detecter.Detecter { func centerface(modelPath string) detecter.Detecter {
modelPath = filepath.Join(modelPath, "centerface") modelPath = filepath.Join(modelPath, "centerface")
d := detecter.NewCenterface() d := detecter.NewCenterface()

View File

@@ -79,7 +79,13 @@ func detect(d detecter.Detecter, e pose.Estimator, imgPath string, filename stri
log.Fatalln(err) log.Fatalln(err)
continue continue
} }
rois[idx].Keypoints = keypoints pts := make([]common.Keypoint, 0, len(keypoints))
for _, pt := range keypoints {
pts = append(pts, common.Keypoint{
Point: pt,
})
}
rois[idx].Keypoints = pts
log.Printf("keypoints: %d\n", len(keypoints)) log.Printf("keypoints: %d\n", len(keypoints))
} }

View File

@@ -0,0 +1,44 @@
package detecter
/*
#include <stdlib.h>
#include <stdbool.h>
#include "openvision/face/detecter.h"
*/
import "C"
import (
"github.com/bububa/openvision/go/common"
"github.com/bububa/openvision/go/face"
)
// YoloFace represents yoloface detecter
type YoloFace struct {
d C.IFaceDetecter
}
// NewYoloFace returns a new YoloFace
func NewYoloFace() *YoloFace {
return &YoloFace{
d: C.new_yoloface(),
}
}
// Destroy free detecter
func (d *YoloFace) Destroy() {
Destroy(d)
}
// Handler returns C.IFaceDetecter
func (d *YoloFace) Handler() C.IFaceDetecter {
return d.d
}
// LoadModel implement Detecter interface
func (d *YoloFace) LoadModel(modelPath string) error {
return LoadModel(d, modelPath)
}
// DetectFace implement Detecter interface
func (d *YoloFace) DetectFace(img *common.Image) ([]face.FaceInfo, error) {
return DetectFace(d, img)
}
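A minimal end-to-end sketch of the new detecter; the go/face/detecter import path is assumed from the package layout, and image loading is left to the examples package shown earlier:

package example

import (
	"path/filepath"

	"github.com/bububa/openvision/go/common"
	"github.com/bububa/openvision/go/face"
	"github.com/bububa/openvision/go/face/detecter"
)

// DetectWithYoloFace loads the yoloface model from modelRoot and runs face
// detection on an already-decoded image.
func DetectWithYoloFace(img *common.Image, modelRoot string) ([]face.FaceInfo, error) {
	d := detecter.NewYoloFace()
	defer d.Destroy()
	if err := d.LoadModel(filepath.Join(modelRoot, "yoloface/v505")); err != nil {
		return nil, err
	}
	return d.DetectFace(img)
}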

View File

@@ -3,7 +3,7 @@ package detecter
/* /*
#include <stdlib.h> #include <stdlib.h>
#include <stdbool.h> #include <stdbool.h>
#include "openvision/hand/common.h" #include "openvision/common/common.h"
#include "openvision/hand/detecter.h" #include "openvision/hand/detecter.h"
*/ */
import "C" import "C"
@@ -12,14 +12,13 @@ import (
openvision "github.com/bububa/openvision/go" openvision "github.com/bububa/openvision/go"
"github.com/bububa/openvision/go/common" "github.com/bububa/openvision/go/common"
"github.com/bububa/openvision/go/hand"
) )
// Detecter represents detecter interface // Detecter represents detecter interface
type Detecter interface { type Detecter interface {
Handler() C.IHandDetecter Handler() C.IHandDetecter
LoadModel(modelPath string) error LoadModel(modelPath string) error
Detect(img *common.Image) ([]hand.ROI, error) Detect(img *common.Image) ([]common.ObjectInfo, error)
Destroy() Destroy()
} }
@@ -40,20 +39,20 @@ func Destroy(d Detecter) {
} }
// Detect detects hand ROIs // Detect detects hand ROIs
func Detect(d Detecter, img *common.Image) ([]hand.ROI, error) { func Detect(d Detecter, img *common.Image) ([]common.ObjectInfo, error) {
imgWidth := img.WidthF64() imgWidth := img.WidthF64()
imgHeight := img.HeightF64() imgHeight := img.HeightF64()
data := img.Bytes() data := img.Bytes()
cROIs := hand.NewCROIVector() cObjs := common.NewCObjectInfoVector()
defer hand.FreeCROIVector(cROIs) defer common.FreeCObjectInfoVector(cObjs)
errCode := C.extract_hand_rois( errCode := C.extract_hand_rois(
d.Handler(), d.Handler(),
(*C.uchar)(unsafe.Pointer(&data[0])), (*C.uchar)(unsafe.Pointer(&data[0])),
C.int(imgWidth), C.int(imgWidth),
C.int(imgHeight), C.int(imgHeight),
(*C.HandROIVector)(unsafe.Pointer(cROIs))) (*C.ObjectInfoVector)(unsafe.Pointer(cObjs)))
if errCode != 0 { if errCode != 0 {
return nil, openvision.DetectHandError(int(errCode)) return nil, openvision.DetectHandError(int(errCode))
} }
return hand.GoROIVector(cROIs, imgWidth, imgHeight), nil return common.GoObjectInfoVector(cObjs, imgWidth, imgHeight), nil
} }

View File

@@ -8,7 +8,6 @@ package detecter
import "C" import "C"
import ( import (
"github.com/bububa/openvision/go/common" "github.com/bububa/openvision/go/common"
"github.com/bububa/openvision/go/hand"
) )
// Nanodet represents nanodet detecter // Nanodet represents nanodet detecter
@@ -39,6 +38,6 @@ func (d *Nanodet) LoadModel(modelPath string) error {
} }
// Detect implement Detecter interface // Detect implement Detecter interface
func (d *Nanodet) Detect(img *common.Image) ([]hand.ROI, error) { func (d *Nanodet) Detect(img *common.Image) ([]common.ObjectInfo, error) {
return Detect(d, img) return Detect(d, img)
} }

View File

@@ -8,7 +8,6 @@ package detecter
import "C" import "C"
import ( import (
"github.com/bububa/openvision/go/common" "github.com/bububa/openvision/go/common"
"github.com/bububa/openvision/go/hand"
) )
// Yolox represents yolox detecter // Yolox represents yolox detecter
@@ -39,6 +38,6 @@ func (d *Yolox) LoadModel(modelPath string) error {
} }
// Detect implement Detecter interface // Detect implement Detecter interface
func (d *Yolox) Detect(img *common.Image) ([]hand.ROI, error) { func (d *Yolox) Detect(img *common.Image) ([]common.ObjectInfo, error) {
return Detect(d, img) return Detect(d, img)
} }

View File

@@ -6,7 +6,6 @@ import (
"github.com/llgcode/draw2d/draw2dimg" "github.com/llgcode/draw2d/draw2dimg"
"github.com/bububa/openvision/go/common" "github.com/bububa/openvision/go/common"
"github.com/bububa/openvision/go/hand"
) )
// Drawer represents a hand drawer // Drawer represents a hand drawer
@@ -36,7 +35,7 @@ func New(options ...Option) *Drawer {
} }
// Draw draw rois // Draw draw rois
func (d *Drawer) Draw(img image.Image, rois []hand.ROI, drawBorder bool) image.Image { func (d *Drawer) Draw(img image.Image, rois []common.ObjectInfo, drawBorder bool) image.Image {
imgW := float64(img.Bounds().Dx()) imgW := float64(img.Bounds().Dx())
imgH := float64(img.Bounds().Dy()) imgH := float64(img.Bounds().Dy())
out := image.NewRGBA(img.Bounds()) out := image.NewRGBA(img.Bounds())
@@ -68,8 +67,8 @@ func (d *Drawer) Draw(img image.Image, rois []hand.ROI, drawBorder bool) image.I
) )
gc.SetStrokeColor(common.ColorFromHex(poseColor)) gc.SetStrokeColor(common.ColorFromHex(poseColor))
if idx == 5 || idx == 9 || idx == 13 || idx == 17 { if idx == 5 || idx == 9 || idx == 13 || idx == 17 {
p0 = roi.Keypoints[0] p0 = roi.Keypoints[0].Point
p1 = roi.Keypoints[idx] p1 = roi.Keypoints[idx].Point
gc.BeginPath() gc.BeginPath()
gc.MoveTo(p0.X*imgW, p0.Y*imgH) gc.MoveTo(p0.X*imgW, p0.Y*imgH)
gc.LineTo(p1.X*imgW, p1.Y*imgH) gc.LineTo(p1.X*imgW, p1.Y*imgH)
@@ -78,8 +77,8 @@ func (d *Drawer) Draw(img image.Image, rois []hand.ROI, drawBorder bool) image.I
} else if idx == 4 || idx == 8 || idx == 12 || idx == 16 { } else if idx == 4 || idx == 8 || idx == 12 || idx == 16 {
continue continue
} }
p0 = roi.Keypoints[idx] p0 = roi.Keypoints[idx].Point
p1 = roi.Keypoints[idx+1] p1 = roi.Keypoints[idx+1].Point
gc.BeginPath() gc.BeginPath()
gc.MoveTo(p0.X*imgW, p0.Y*imgH) gc.MoveTo(p0.X*imgW, p0.Y*imgH)
gc.LineTo(p1.X*imgW, p1.Y*imgH) gc.LineTo(p1.X*imgW, p1.Y*imgH)
@@ -94,7 +93,7 @@ func (d *Drawer) Draw(img image.Image, rois []hand.ROI, drawBorder bool) image.I
colorIdx-- colorIdx--
} }
poseColor := PoseColors[colorIdx] poseColor := PoseColors[colorIdx]
common.DrawCircle(gc, common.Pt(pt.X*imgW, pt.Y*imgH), d.KeypointRadius, poseColor, "", d.KeypointStrokeWidth) common.DrawCircle(gc, common.Pt(pt.Point.X*imgW, pt.Point.Y*imgH), d.KeypointRadius, poseColor, "", d.KeypointStrokeWidth)
} }
} }
return out return out

View File

@@ -3,7 +3,7 @@ package pose
/* /*
#include <stdlib.h> #include <stdlib.h>
#include <stdbool.h> #include <stdbool.h>
#include "openvision/hand/common.h" #include "openvision/common/common.h"
#include "openvision/hand/pose.h" #include "openvision/hand/pose.h"
*/ */
import "C" import "C"

View File

@@ -1,84 +0,0 @@
package hand
/*
#include <stdlib.h>
#include <stdbool.h>
#include "openvision/hand/common.h"
*/
import "C"
import (
"unsafe"
"github.com/bububa/openvision/go/common"
)
// ROI represents detected person roi
type ROI struct {
// Score detected score
Score float32
// Label
Label int
// Rect roi location
Rect common.Rectangle
// Points keypoints
Keypoints []common.Point
}
// GoROI convert C.HandROI to go type
func GoROI(c *C.HandROI, w float64, h float64) ROI {
return ROI{
Label: int(c.label),
Score: float32(c.prob),
Rect: common.Rect(
float64(c.rect.x)/w,
float64(c.rect.y)/h,
float64(c.rect.width)/w,
float64(c.rect.height)/h,
),
}
}
// ToCROI returns ROI C type
func (r ROI) ToCROI(w float64, h float64) *C.HandROI {
ret := (*C.HandROI)(C.malloc(C.sizeof_HandROI))
ret.label = C.int(r.Label)
ret.prob = C.float(r.Score)
ret.rect.x = C.int(r.Rect.X * w)
ret.rect.y = C.int(r.Rect.Y * h)
ret.rect.width = C.int(r.Rect.Width * w)
ret.rect.height = C.int(r.Rect.Height * h)
return ret
}
// NewROIVector returns *C.HandROIVector
func NewCROIVector() *C.HandROIVector {
return (*C.HandROIVector)(C.malloc(C.sizeof_HandROIVector))
}
// FreeCROIVector release *C.HandROIVectore memory
func FreeCROIVector(p *C.HandROIVector) {
C.FreeHandROIVector(p)
C.free(unsafe.Pointer(p))
}
// GoROIVector convert *C.HandROIVector to ROI slice
func GoROIVector(c *C.HandROIVector, w float64, h float64) []ROI {
l := int(c.length)
ret := make([]ROI, 0, l)
ptr := unsafe.Pointer(c.items)
for i := 0; i < l; i++ {
cVal := (*C.HandROI)(unsafe.Pointer(uintptr(ptr) + uintptr(C.sizeof_HandROI*C.int(i))))
ret = append(ret, GoROI(cVal, w, h))
}
return ret
}
// CROIVectiorLength get C.HandROIVector length
func CROIVectiorLength(c *C.HandROIVector) int {
return int(c.length)
}
// CROIVectorPtr get C.HandROIVector start pointer
func CROIVectorPtr(c *C.HandROIVector) unsafe.Pointer {
return unsafe.Pointer(c.items)
}

View File

@@ -3,7 +3,7 @@ package detecter
/* /*
#include <stdlib.h> #include <stdlib.h>
#include <stdbool.h> #include <stdbool.h>
#include "openvision/pose/common.h" #include "openvision/common/common.h"
#include "openvision/pose/detecter.h" #include "openvision/pose/detecter.h"
*/ */
import "C" import "C"
@@ -12,14 +12,13 @@ import (
openvision "github.com/bububa/openvision/go" openvision "github.com/bububa/openvision/go"
"github.com/bububa/openvision/go/common" "github.com/bububa/openvision/go/common"
"github.com/bububa/openvision/go/pose"
) )
// Detecter represents detecter interface // Detecter represents detecter interface
type Detecter interface { type Detecter interface {
Handler() C.IPoseDetecter Handler() C.IPoseDetecter
LoadModel(modelPath string) error LoadModel(modelPath string) error
ExtractKeypoints(img *common.Image) ([]pose.ROI, error) ExtractKeypoints(img *common.Image) ([]common.ObjectInfo, error)
Destroy() Destroy()
} }
@@ -40,37 +39,40 @@ func Destroy(d Detecter) {
} }
// ExtractKeypoints detects pose keypoints using detecter // ExtractKeypoints detects pose keypoints using detecter
func ExtractKeypoints(d Detecter, img *common.Image) ([]pose.ROI, error) { func ExtractKeypoints(d Detecter, img *common.Image) ([]common.ObjectInfo, error) {
imgWidth := img.WidthF64() imgWidth := img.WidthF64()
imgHeight := img.HeightF64() imgHeight := img.HeightF64()
data := img.Bytes() data := img.Bytes()
cROIs := pose.NewCROIVector() cObjs := common.NewCObjectInfoVector()
defer pose.FreeCROIVector(cROIs) defer common.FreeCObjectInfoVector(cObjs)
errCode := C.extract_pose_rois( errCode := C.extract_pose_rois(
d.Handler(), d.Handler(),
(*C.uchar)(unsafe.Pointer(&data[0])), (*C.uchar)(unsafe.Pointer(&data[0])),
C.int(imgWidth), C.int(imgWidth),
C.int(imgHeight), C.int(imgHeight),
(*C.PoseROIVector)(unsafe.Pointer(cROIs))) (*C.ObjectInfoVector)(unsafe.Pointer(cObjs)))
if errCode != 0 { if errCode != 0 {
return nil, openvision.DetectPoseError(int(errCode)) return nil, openvision.DetectPoseError(int(errCode))
} }
totalROIs := pose.CROIVectiorLength(cROIs) totalROIs := common.CObjectInfoVectiorLength(cObjs)
rois := make([]pose.ROI, 0, totalROIs) rois := make([]common.ObjectInfo, 0, totalROIs)
ptr := pose.CROIVectorPtr(cROIs) ptr := common.CObjectInfoVectorPtr(cObjs)
for i := 0; i < totalROIs; i++ { for i := 0; i < totalROIs; i++ {
cKeypoints := pose.NewCKeypointVector() cKeypoints := common.NewCKeypointVector()
defer pose.FreeCKeypointVector(cKeypoints) defer common.FreeCKeypointVector(cKeypoints)
cROI := (*C.PoseROI)(unsafe.Pointer(uintptr(ptr) + uintptr(C.sizeof_PoseROI*C.int(i)))) cROI := (*C.ObjectInfo)(unsafe.Pointer(uintptr(ptr) + uintptr(C.sizeof_ObjectInfo*C.int(i))))
errCode := C.extract_pose_keypoints( errCode := C.extract_pose_keypoints(
d.Handler(), d.Handler(),
cROI, (*C.uchar)(unsafe.Pointer(&data[0])),
(*C.PoseKeypointVector)(unsafe.Pointer(cKeypoints))) C.int(imgWidth),
C.int(imgHeight),
(*C.Rect)(unsafe.Pointer(&cROI.rect)),
(*C.KeypointVector)(unsafe.Pointer(cKeypoints)))
if errCode != 0 { if errCode != 0 {
return nil, openvision.DetectPoseError(int(errCode)) return nil, openvision.DetectPoseError(int(errCode))
} }
keypoints := pose.GoKeypointVector(cKeypoints, imgWidth, imgHeight) keypoints := common.GoKeypointVector(cKeypoints, imgWidth, imgHeight)
rois = append(rois, pose.ROI{ rois = append(rois, common.ObjectInfo{
Keypoints: keypoints, Keypoints: keypoints,
Rect: common.Rect( Rect: common.Rect(
float64(cROI.rect.x)/imgWidth, float64(cROI.rect.x)/imgWidth,
@@ -78,7 +80,7 @@ func ExtractKeypoints(d Detecter, img *common.Image) ([]pose.ROI, error) {
float64(cROI.rect.width)/imgWidth, float64(cROI.rect.width)/imgWidth,
float64(cROI.rect.height)/imgHeight, float64(cROI.rect.height)/imgHeight,
), ),
Score: float32(cROI.score), Score: float32(cROI.prob),
}) })
} }

View File

@@ -8,7 +8,6 @@ package detecter
import "C" import "C"
import ( import (
"github.com/bububa/openvision/go/common" "github.com/bububa/openvision/go/common"
"github.com/bububa/openvision/go/pose"
) )
// Ultralight represents ultralight detecter // Ultralight represents ultralight detecter
@@ -39,6 +38,6 @@ func (d *Ultralight) LoadModel(modelPath string) error {
} }
// ExtractKeypoints implement Detecter interface // ExtractKeypoints implement Detecter interface
func (d *Ultralight) ExtractKeypoints(img *common.Image) ([]pose.ROI, error) { func (d *Ultralight) ExtractKeypoints(img *common.Image) ([]common.ObjectInfo, error) {
return ExtractKeypoints(d, img) return ExtractKeypoints(d, img)
} }

View File

@@ -6,7 +6,6 @@ import (
"github.com/llgcode/draw2d/draw2dimg" "github.com/llgcode/draw2d/draw2dimg"
"github.com/bububa/openvision/go/common" "github.com/bububa/openvision/go/common"
"github.com/bububa/openvision/go/pose"
) )
// Drawer represents a pose drawer // Drawer represents a pose drawer
@@ -36,7 +35,7 @@ func New(options ...Option) *Drawer {
} }
// Draw draw rois // Draw draw rois
func (d *Drawer) Draw(img image.Image, rois []pose.ROI, drawBorder bool) image.Image { func (d *Drawer) Draw(img image.Image, rois []common.ObjectInfo, drawBorder bool) image.Image {
imgW := float64(img.Bounds().Dx()) imgW := float64(img.Bounds().Dx())
imgH := float64(img.Bounds().Dy()) imgH := float64(img.Bounds().Dy())
out := image.NewRGBA(img.Bounds()) out := image.NewRGBA(img.Bounds())

View File

@@ -1,63 +0,0 @@
package pose
/*
#include <stdlib.h>
#include <stdbool.h>
#include "openvision/pose/common.h"
*/
import "C"
import (
"unsafe"
"github.com/bububa/openvision/go/common"
)
// Keypoint represents detected body keypoint
type Keypoint struct {
// Point keypoint location
Point common.Point
// Score keypoint prob
Score float32
}
// GoKeypoint convert C.PoseKeypoint to go type
func GoKeypoint(c *C.PoseKeypoint, w float64, h float64) Keypoint {
return Keypoint{
Point: common.Pt(float64(c.p.x)/w, float64(c.p.y)/h),
Score: float32(c.prob),
}
}
// Convert Keypoint to C.Keypoint pointer
func (k Keypoint) CKeypoint(w float64, h float64) *C.PoseKeypoint {
ret := (*C.PoseKeypoint)(C.malloc(C.sizeof_PoseKeypoint))
ret.prob = C.float(k.Score)
ret.p = C.Point2f{
C.float(k.Point.X * w),
C.float(k.Point.Y * h),
}
return ret
}
// NewCKeypointVector returns *C.PoseKeypointVector
func NewCKeypointVector() *C.PoseKeypointVector {
return (*C.PoseKeypointVector)(C.malloc(C.sizeof_PoseKeypointVector))
}
// FreeCKeypointVector release *C.PoseKeypointVector memory
func FreeCKeypointVector(points *C.PoseKeypointVector) {
C.FreePoseKeypointVector(points)
C.free(unsafe.Pointer(points))
}
// GoKeypointVector convert *C.PoseKeypointVector to Keypoint slice
func GoKeypointVector(c *C.PoseKeypointVector, w float64, h float64) []Keypoint {
l := int(c.length)
ret := make([]Keypoint, 0, l)
ptr := unsafe.Pointer(c.points)
for i := 0; i < l; i++ {
cKeypoint := (*C.PoseKeypoint)(unsafe.Pointer(uintptr(ptr) + uintptr(C.sizeof_PoseKeypoint*C.int(i))))
ret = append(ret, GoKeypoint(cKeypoint, w, h))
}
return ret
}

View File

@@ -1,42 +0,0 @@
package pose
/*
#include <stdlib.h>
#include <stdbool.h>
#include "openvision/pose/common.h"
*/
import "C"
import (
"unsafe"
"github.com/bububa/openvision/go/common"
)
// ROI represents detected person roi
type ROI struct {
// Score detected score
Score float32
// Rect roi location
Rect common.Rectangle
// Keypoints
Keypoints []Keypoint
}
// NewROIVector returns *C.PoseROIVector
func NewCROIVector() *C.PoseROIVector {
return (*C.PoseROIVector)(C.malloc(C.sizeof_PoseROIVector))
}
// FreeCROIVector release *C.PoseROIVectore memory
func FreeCROIVector(p *C.PoseROIVector) {
C.FreePoseROIVector(p)
C.free(unsafe.Pointer(p))
}
func CROIVectiorLength(c *C.PoseROIVector) int {
return int(c.length)
}
func CROIVectorPtr(c *C.PoseROIVector) unsafe.Pointer {
return unsafe.Pointer(c.items)
}

View File

@@ -61,12 +61,10 @@ target_include_directories(openvision
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/face/hopenet> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/face/hopenet>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/hand> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/hand>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/hand/common>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/hand/detecter> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/hand/detecter>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/hand/pose> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/hand/pose>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pose> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pose>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pose/common>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pose/detecter> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pose/detecter>
) )
@@ -87,14 +85,12 @@ file(COPY
) )
file(COPY file(COPY
${CMAKE_CURRENT_SOURCE_DIR}/hand/common.h
${CMAKE_CURRENT_SOURCE_DIR}/hand/detecter.h ${CMAKE_CURRENT_SOURCE_DIR}/hand/detecter.h
${CMAKE_CURRENT_SOURCE_DIR}/hand/pose.h ${CMAKE_CURRENT_SOURCE_DIR}/hand/pose.h
DESTINATION ${INCLUDE_OUTPUT_PATH}/openvision/hand DESTINATION ${INCLUDE_OUTPUT_PATH}/openvision/hand
) )
file(COPY file(COPY
${CMAKE_CURRENT_SOURCE_DIR}/pose/common.h
${CMAKE_CURRENT_SOURCE_DIR}/pose/detecter.h ${CMAKE_CURRENT_SOURCE_DIR}/pose/detecter.h
DESTINATION ${INCLUDE_OUTPUT_PATH}/openvision/pose DESTINATION ${INCLUDE_OUTPUT_PATH}/openvision/pose
) )

View File

@@ -2,6 +2,7 @@
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <math.h> #include <math.h>
#include <float.h>
#ifdef OV_VULKAN #ifdef OV_VULKAN
#include "gpu.h" #include "gpu.h"
@@ -42,6 +43,13 @@ void FreePoint2fVector(Point2fVector* p) {
} }
} }
void Point2fVectorSetValue(Point2fVector *p, int i, const Point2f* val) {
if (p->points == NULL || i >= p->length) {
return;
}
p->points[i] = *val;
}
void FreeFloatVector(FloatVector *p) { void FreeFloatVector(FloatVector *p) {
if (p->values != NULL) { if (p->values != NULL) {
free(p->values); free(p->values);
@@ -56,6 +64,37 @@ void FreeBytes(Bytes *p) {
} }
} }
void FreeKeypointVector(KeypointVector *p) {
if (p->points != NULL) {
free(p->points);
p->points = NULL;
}
}
void KeypointVectorSetValue(KeypointVector *p, int i, const Keypoint* val) {
if (p->points == NULL || i >= p->length) {
return;
}
p->points[i] = *val;
}
void FreeObjectInfo(ObjectInfo *p) {
if (p->pts != NULL) {
FreeKeypointVector(p->pts);
free(p->pts);
p->pts = NULL;
}
}
void FreeObjectInfoVector(ObjectInfoVector *p) {
if (p->items!=NULL) {
for (int i=0; i < p->length; i ++) {
FreeObjectInfo(&p->items[i]);
}
free(p->items);
p->items= NULL;
}
}
namespace ov { namespace ov {
int RatioAnchors(const Rect & anchor, int RatioAnchors(const Rect & anchor,
@@ -164,4 +203,105 @@ void RectifyRect(Rect* rect) {
rect->height = max_side; rect->height = max_side;
} }
void qsort_descent_inplace(std::vector<ObjectInfo>& objects, int left, int right)
{
int i = left;
int j = right;
float p = objects[(left + right) / 2].prob;
while (i <= j)
{
while (objects[i].prob > p)
i++;
while (objects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(objects[i], objects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(objects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(objects, i, right);
}
}
}
void qsort_descent_inplace(std::vector<ObjectInfo>& objects)
{
if (objects.empty())
return;
qsort_descent_inplace(objects, 0, objects.size() - 1);
}
void nms_sorted_bboxes(const std::vector<ObjectInfo>& objects, std::vector<int>& picked, float nms_threshold)
{
picked.clear();
const int n = objects.size();
std::vector<float> areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = objects[i].rect.area();
}
for (int i = 0; i < n; i++)
{
const ObjectInfo& a = objects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const ObjectInfo& b = objects[picked[j]];
// intersection over union
float inter_area = InterRectArea(a.rect, b.rect);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
int generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
{
for (auto stride : strides)
{
int num_grid = target_size / stride;
for (int g1 = 0; g1 < num_grid; g1++)
{
for (int g0 = 0; g0 < num_grid; g0++)
{
grid_strides.push_back((GridAndStride){g0, g1, stride});
}
}
}
return 0;
}
float sigmoid(float x)
{
return static_cast<float>(1.f / (1.f + exp(-x)));
}
} }
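For reference, nms_sorted_bboxes keeps a candidate only while its overlap with every higher-scoring picked box stays at or below nms_threshold, with overlap measured as IoU = inter_area / (area_a + area_b - inter_area). A quick worked example: two 10x10 boxes at x = 0 and x = 5 on the same row overlap in a 5x10 strip, so IoU = 50 / (100 + 100 - 50) = 1/3 ≈ 0.33; with the 0.45 threshold used by the yoloface detecter below, both boxes survive.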

View File

@@ -11,6 +11,7 @@ typedef ov::Size Size;
typedef ov::Point Point; typedef ov::Point Point;
typedef ov::Point2f Point2f; typedef ov::Point2f Point2f;
typedef ov::Rect Rect; typedef ov::Rect Rect;
typedef ov::Keypoint Keypoint;
#else #else
// Wrapper for an individual cv::cvSize // Wrapper for an individual cv::cvSize
@@ -40,6 +41,12 @@ typedef struct Rect {
int height; int height;
} Rect; } Rect;
typedef struct Keypoint {
Point2f p;
float prob;
} Keypoint;
#endif #endif
typedef void* IEstimator; typedef void* IEstimator;
@@ -56,6 +63,7 @@ typedef struct Point2fVector {
} Point2fVector; } Point2fVector;
void FreePoint2fVector(Point2fVector *p); void FreePoint2fVector(Point2fVector *p);
void Point2fVectorSetValue(Point2fVector *p, int i, const Point2f* val);
typedef struct RectVector { typedef struct RectVector {
Rect* rects; Rect* rects;
@@ -78,6 +86,30 @@ typedef struct Bytes {
void FreeBytes(Bytes *p); void FreeBytes(Bytes *p);
typedef struct KeypointVector {
Keypoint* points;
int length;
} KeypointVector;
void FreeKeypointVector(KeypointVector *p);
void KeypointVectorSetValue(KeypointVector *p, int i, const Keypoint* val);
typedef struct ObjectInfoC {
Rect rect;
float prob;
int label;
KeypointVector* pts;
} ObjectInfo;
void FreeObjectInfo(ObjectInfo *p);
typedef struct ObjectInfoVector {
ObjectInfo* items;
int length;
} ObjectInfoVector;
void FreeObjectInfoVector(ObjectInfoVector *p);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@@ -4,6 +4,7 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include "config.h" #include "config.h"
#include "net.h"
#ifdef OV_OPENMP #ifdef OV_OPENMP
#include <omp.h> #include <omp.h>
#endif #endif
@@ -80,10 +81,23 @@ struct ImageInfo {
float score_; float score_;
}; };
struct Keypoint {
ov::Point2f p;
float prob;
};
struct ObjectInfo { struct ObjectInfo {
Rect location_; Rect rect;
float score_; float prob;
std::string name_; int label;
std::vector<Point2f> pts;
};
struct GridAndStride
{
int grid0;
int grid1;
int stride;
}; };
int RatioAnchors(const Rect & anchor, int RatioAnchors(const Rect & anchor,
@@ -140,6 +154,16 @@ int const NMS(const std::vector<T>& inputs, std::vector<T>* result,
return 0; return 0;
} }
void qsort_descent_inplace(std::vector<ObjectInfo>& objects, int left, int right);
void qsort_descent_inplace(std::vector<ObjectInfo>& objects);
void nms_sorted_bboxes(const std::vector<ObjectInfo>& objects, std::vector<int>& picked, float nms_threshold);
int generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides);
float sigmoid(float x);
void EnlargeRect(const float& scale, Rect* rect); void EnlargeRect(const float& scale, Rect* rect);
void RectifyRect(Rect* rect); void RectifyRect(Rect* rect);

View File

@@ -11,6 +11,7 @@ extern "C" {
IFaceDetecter new_retinaface(); IFaceDetecter new_retinaface();
IFaceDetecter new_centerface(); IFaceDetecter new_centerface();
IFaceDetecter new_mtcnn(); IFaceDetecter new_mtcnn();
IFaceDetecter new_yoloface();
IFaceDetecter new_anticonv(); IFaceDetecter new_anticonv();
int detect_face(IFaceDetecter d, const unsigned char* rgbdata, int img_width, int img_height, FaceInfoVector* faces); int detect_face(IFaceDetecter d, const unsigned char* rgbdata, int img_width, int img_height, FaceInfoVector* faces);
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -3,6 +3,7 @@
#include "mtcnn/mtcnn.hpp" #include "mtcnn/mtcnn.hpp"
#include "retinaface/retinaface.hpp" #include "retinaface/retinaface.hpp"
#include "anticonv/anticonv.hpp" #include "anticonv/anticonv.hpp"
#include "yoloface/yoloface.hpp"
IFaceDetecter new_retinaface() { IFaceDetecter new_retinaface() {
return new ovface::RetinaFace(); return new ovface::RetinaFace();
@@ -16,6 +17,10 @@ IFaceDetecter new_mtcnn() {
return new ovface::Mtcnn(); return new ovface::Mtcnn();
} }
IFaceDetecter new_yoloface() {
return new ovface::YoloFace();
}
IFaceDetecter new_anticonv() { IFaceDetecter new_anticonv() {
return new ovface::AntiConv(); return new ovface::AntiConv();
} }
@@ -49,6 +54,10 @@ Detecter* RetinafaceFactory::CreateDetecter() {
return new RetinaFace(); return new RetinaFace();
} }
Detecter* YoloFaceFactory::CreateDetecter() {
return new YoloFace();
}
Detecter* AnticonvFactory::CreateDetecter() { Detecter* AnticonvFactory::CreateDetecter() {
return new AntiConv(); return new AntiConv();
} }

View File

@@ -44,6 +44,13 @@ public:
Detecter* CreateDetecter(); Detecter* CreateDetecter();
}; };
class YoloFaceFactory : public DetecterFactory {
public:
YoloFaceFactory() {}
~YoloFaceFactory() {}
Detecter* CreateDetecter();
};
class AnticonvFactory : public DetecterFactory { class AnticonvFactory : public DetecterFactory {
public: public:
AnticonvFactory() {} AnticonvFactory() {}

View File

@@ -0,0 +1,290 @@
#include "yoloface.hpp"
#include "../../../common/yolov5focus.hpp"
#include <string>
#include <float.h>
#ifdef OV_VULKAN
#include "gpu.h"
#endif // OV_VULKAN
namespace ovface {
static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<ov::ObjectInfo>& objects)
{
const int num_grid = feat_blob.h;
int num_grid_x;
int num_grid_y;
if (in_pad.w > in_pad.h)
{
num_grid_x = in_pad.w / stride;
num_grid_y = num_grid / num_grid_x;
}
else
{
num_grid_y = in_pad.h / stride;
num_grid_x = num_grid / num_grid_y;
}
const int num_class = feat_blob.w - 5 - 10; // 4 box values + 1 objectness + 10 landmark coords
const int num_anchors = anchors.w / 2;
for (int q = 0; q < num_anchors; q++)
{
const float anchor_w = anchors[q * 2];
const float anchor_h = anchors[q * 2 + 1];
const ncnn::Mat feat = feat_blob.channel(q);
for (int i = 0; i < num_grid_y; i++)
{
for (int j = 0; j < num_grid_x; j++)
{
const float* featptr = feat.row(i * num_grid_x + j);
// find class index with max class score
int class_index = 0;
float class_score = -FLT_MAX;
for (int k = 0; k < num_class; k++)
{
float score = featptr[5 + 10 + k];
if (score > class_score)
{
class_index = k;
class_score = score;
}
}
float box_score = featptr[4];
float confidence = ov::sigmoid(box_score); //* sigmoid(class_score);
if (confidence >= prob_threshold)
{
// yolov5/models/yolo.py Detect forward
// y = x[i].sigmoid()
// y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy
// y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
float dx = ov::sigmoid(featptr[0]);
float dy = ov::sigmoid(featptr[1]);
float dw = ov::sigmoid(featptr[2]);
float dh = ov::sigmoid(featptr[3]);
float pb_cx = (dx * 2.f - 0.5f + j) * stride;
float pb_cy = (dy * 2.f - 0.5f + i) * stride;
float pb_w = pow(dw * 2.f, 2) * anchor_w;
float pb_h = pow(dh * 2.f, 2) * anchor_h;
float x0 = pb_cx - pb_w * 0.5f;
float y0 = pb_cy - pb_h * 0.5f;
float x1 = pb_cx + pb_w * 0.5f;
float y1 = pb_cy + pb_h * 0.5f;
ov::ObjectInfo obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = x1 - x0;
obj.rect.height = y1 - y0;
obj.label = class_index;
obj.prob = confidence;
for (int l = 0; l < 5; l++)
{
float x = featptr[2 * l + 5] * anchor_w + j * stride;
float y = featptr[2 * l + 1 + 5] * anchor_h + i * stride;
obj.pts.push_back(ov::Point2f(x, y));
}
objects.push_back(obj);
}
}
}
}
}
YoloFace::YoloFace() :
net_ (new ncnn::Net()),
initialized_(false) {
#ifdef OV_VULKAN
net_->opt.use_vulkan_compute = true;
#endif // OV_VULKAN
}
YoloFace::~YoloFace() {
net_->clear();
}
int YoloFace::LoadModel(const char * root_path) {
register_yolov5focus(net_);
std::string param_file = std::string(root_path) + "/param";
std::string bin_file = std::string(root_path) + "/bin";
if (net_->load_param(param_file.c_str()) == -1 ||
net_->load_model(bin_file.c_str()) == -1) {
return 10000;
}
initialized_ = true;
return 0;
}
int YoloFace::DetectFace(const unsigned char* rgbdata,
int img_width, int img_height,
std::vector<FaceInfo>* faces) {
faces->clear();
if (!initialized_) {
return 10000;
}
if (rgbdata == 0){
return 10001;
}
// letterbox pad to multiple of 32
int w = img_width;
int h = img_height;
float scale = 1.f;
if (w > h)
{
scale = (float)target_size / w;
w = target_size;
h = h * scale;
}
else
{
scale = (float)target_size / h;
h = target_size;
w = w * scale;
}
ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgbdata, ncnn::Mat::PIXEL_RGB, img_width, img_height, w, h);
// pad to target_size rectangle
// yolov5/utils/datasets.py letterbox
int wpad = (w + 31) / 32 * 32 - w;
int hpad = (h + 31) / 32 * 32 - h;
ncnn::Mat in_pad;
ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
in_pad.substract_mean_normalize(0, norm_vals);
ncnn::Extractor ex = net_->create_extractor();
ex.input("data", in_pad);
std::vector<ov::ObjectInfo> proposals;
// anchor setting from yolov5/models/yolov5s.yaml
// stride 8
{
ncnn::Mat out;
ex.extract("981", out);
ncnn::Mat anchors(6);
anchors[0] = 4.f;
anchors[1] = 5.f;
anchors[2] = 8.f;
anchors[3] = 10.f;
anchors[4] = 13.f;
anchors[5] = 16.f;
std::vector<ov::ObjectInfo> objects8;
generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);
proposals.insert(proposals.end(), objects8.begin(), objects8.end());
}
// stride 16
{
ncnn::Mat out;
ex.extract("983", out);
ncnn::Mat anchors(6);
anchors[0] = 23.f;
anchors[1] = 29.f;
anchors[2] = 43.f;
anchors[3] = 55.f;
anchors[4] = 73.f;
anchors[5] = 105.f;
std::vector<ov::ObjectInfo> objects16;
generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);
proposals.insert(proposals.end(), objects16.begin(), objects16.end());
}
// stride 32
{
ncnn::Mat out;
ex.extract("985", out);
ncnn::Mat anchors(6);
anchors[0] = 146.f;
anchors[1] = 217.f;
anchors[2] = 231.f;
anchors[3] = 300.f;
anchors[4] = 335.f;
anchors[5] = 433.f;
std::vector<ov::ObjectInfo> objects32;
generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);
proposals.insert(proposals.end(), objects32.begin(), objects32.end());
}
// sort all proposals by score from highest to lowest
qsort_descent_inplace(proposals);
// apply nms with nms_threshold
std::vector<int> picked;
nms_sorted_bboxes(proposals, picked, nms_threshold);
int count = picked.size();
for (int i = 0; i < count; i++)
{
ov::ObjectInfo obj = proposals[picked[i]];
// adjust offset to original unpadded
float x0 = (obj.rect.x - (float(wpad) / 2)) / scale;
float y0 = (obj.rect.y - (float(hpad) / 2)) / scale;
float x1 = (obj.rect.x + obj.rect.width - (float(wpad) / 2)) / scale;
float y1 = (obj.rect.y + obj.rect.height - (float(hpad) / 2)) / scale;
for (int j = 0; j < obj.pts.size(); j++)
{
float ptx = (obj.pts[j].x - (float(wpad) / 2)) / scale;
float pty = (obj.pts[j].y - (float(hpad) / 2)) / scale;
obj.pts[j] = ov::Point2f(ptx, pty);
}
// clip
x0 = std::max(std::min(x0, (float)(img_width - 1)), 0.f);
y0 = std::max(std::min(y0, (float)(img_height - 1)), 0.f);
x1 = std::max(std::min(x1, (float)(img_width - 1)), 0.f);
y1 = std::max(std::min(y1, (float)(img_height - 1)), 0.f);
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = x1 - x0;
obj.rect.height = y1 - y0;
FaceInfo info;
info.location_ = obj.rect;
for (int k = 0; k < 5; ++k) {
info.keypoints_[k] = obj.pts[k].x;
info.keypoints_[k + 5] = obj.pts[k].y;
}
faces->push_back(info);
}
return 0;
}
}
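For reference, generate_proposals above follows the yolov5 Detect-head decode quoted in its comments: with stride s, grid cell (j, i) and anchor (aw, ah),
cx = (2*sigmoid(tx) - 0.5 + j) * s, cy = (2*sigmoid(ty) - 0.5 + i) * s, w = (2*sigmoid(tw))^2 * aw, h = (2*sigmoid(th))^2 * ah,
and the final loop maps boxes and landmarks back to the unpadded input with x_orig = (x - wpad/2) / scale (and likewise for y with hpad), before clipping to the image bounds.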

View File

@@ -0,0 +1,30 @@
#ifndef _YOLOFACE_H_
#define _YOLOFACE_H_
#include "../detecter.hpp"
#include "net.h"
namespace ovface {
class YoloFace : public Detecter {
public:
YoloFace();
~YoloFace();
int LoadModel(const char* root_path);
int DetectFace(const unsigned char* rgbdata,
int img_width, int img_height,
std::vector<FaceInfo>* faces);
private:
ncnn::Net* net_;
bool initialized_;
const int target_size = 640;
const float mean_vals[3] = {127.f, 127.f, 127.f};
const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
const float prob_threshold = 0.25f;
const float nms_threshold = 0.45f;
};
}
#endif // !_YOLOFACE_H_

View File

@@ -1,29 +0,0 @@
#ifndef _HAND_COMMON_C_H_
#define _HAND_COMMON_C_H_
#include "../common/common.h"
#ifdef __cplusplus
#include "common/common.hpp"
extern "C" {
#endif
#ifdef __cplusplus
typedef ovhand::HandROI HandROI;
#else
typedef struct HandROI {
Rect rect;
int label;
float prob;
} HandROI;
#endif
typedef struct HandROIVector {
HandROI* items;
int length;
} HandROIVector;
void FreeHandROIVector(HandROIVector *p);
#ifdef __cplusplus
}
#endif
#endif // !_HAND_COMMON_C_H_

View File

@@ -1,265 +0,0 @@
#include "../common.h"
#include <float.h>
#include <math.h>
void FreeHandROIVector(HandROIVector *p) {
if (p->items!=NULL) {
free(p->items);
p->items= NULL;
}
}
namespace ovhand {
inline float intersection_area(const HandROI& a, const HandROI& b)
{
ov::Rect inter = a.rect & b.rect;
return inter.area();
}
void qsort_descent_inplace(std::vector<HandROI>& objects, int left, int right)
{
int i = left;
int j = right;
float p = objects[(left + right) / 2].prob;
while (i <= j)
{
while (objects[i].prob > p)
i++;
while (objects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(objects[i], objects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(objects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(objects, i, right);
}
}
}
void qsort_descent_inplace(std::vector<HandROI>& objects)
{
if (objects.empty())
return;
qsort_descent_inplace(objects, 0, objects.size() - 1);
}
void nms_sorted_bboxes(const std::vector<HandROI>& objects, std::vector<int>& picked, float nms_threshold)
{
picked.clear();
const int n = objects.size();
std::vector<float> areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = objects[i].rect.area();
}
for (int i = 0; i < n; i++)
{
const HandROI& a = objects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const HandROI& b = objects[picked[j]];
// intersection over union
float inter_area = intersection_area(a, b);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
int generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
{
for (auto stride : strides)
{
int num_grid = target_size / stride;
for (int g1 = 0; g1 < num_grid; g1++)
{
for (int g0 = 0; g0 < num_grid; g0++)
{
grid_strides.push_back((GridAndStride){g0, g1, stride});
}
}
}
return 0;
}
void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<HandROI>& objects)
{
const int num_grid = feat_blob.h;
const int num_class = feat_blob.w - 5;
const int num_anchors = grid_strides.size();
const float* feat_ptr = feat_blob.channel(0);
for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
{
const int grid0 = grid_strides[anchor_idx].grid0;
const int grid1 = grid_strides[anchor_idx].grid1;
const int stride = grid_strides[anchor_idx].stride;
// yolox/models/yolo_head.py decode logic
// outputs[..., :2] = (outputs[..., :2] + grids) * strides
// outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
float x_center = (feat_ptr[0] + grid0) * stride;
float y_center = (feat_ptr[1] + grid1) * stride;
float w = exp(feat_ptr[2]) * stride;
float h = exp(feat_ptr[3]) * stride;
float x0 = x_center - w * 0.5f;
float y0 = y_center - h * 0.5f;
float box_objectness = feat_ptr[4];
for (int class_idx = 0; class_idx < num_class; class_idx++)
{
float box_cls_score = feat_ptr[5 + class_idx];
float box_prob = box_objectness * box_cls_score;
if (box_prob > prob_threshold)
{
HandROI obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = w;
obj.rect.height = h;
obj.label = class_idx;
obj.prob = box_prob;
objects.push_back(obj);
}
} // class loop
feat_ptr += feat_blob.w;
} // point anchor loop
}
void generate_nanodet_proposals(const ncnn::Mat& cls_pred, const ncnn::Mat& dis_pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<HandROI>& objects)
{
const int num_grid = cls_pred.h;
int num_grid_x;
int num_grid_y;
if (in_pad.w > in_pad.h)
{
num_grid_x = in_pad.w / stride;
num_grid_y = num_grid / num_grid_x;
}
else
{
num_grid_y = in_pad.h / stride;
num_grid_x = num_grid / num_grid_y;
}
const int num_class = cls_pred.w;
const int reg_max_1 = dis_pred.w / 4;
//__android_log_print(ANDROID_LOG_WARN, "ncnn","cls_pred h %d, w %d",cls_pred.h,cls_pred.w);
//__android_log_print(ANDROID_LOG_WARN, "ncnn","%d,%d,%d,%d",num_grid_x,num_grid_y,num_class,reg_max_1);
for (int i = 0; i < num_grid_y; i++)
{
for (int j = 0; j < num_grid_x; j++)
{
const int idx = i * num_grid_x + j;
const float* scores = cls_pred.row(idx);
// find label with max score
int label = -1;
float score = -FLT_MAX;
for (int k = 0; k < num_class; k++)
{
if (scores[k] > score)
{
label = k;
score = scores[k];
}
}
if (score >= prob_threshold)
{
ncnn::Mat bbox_pred(reg_max_1, 4, (void*)dis_pred.row(idx));
{
ncnn::Layer* softmax = ncnn::create_layer("Softmax");
ncnn::ParamDict pd;
pd.set(0, 1); // axis
pd.set(1, 1);
softmax->load_param(pd);
ncnn::Option opt;
opt.num_threads = 1;
opt.use_packing_layout = false;
softmax->create_pipeline(opt);
softmax->forward_inplace(bbox_pred, opt);
softmax->destroy_pipeline(opt);
delete softmax;
}
float pred_ltrb[4];
for (int k = 0; k < 4; k++)
{
float dis = 0.f;
const float* dis_after_sm = bbox_pred.row(k);
for (int l = 0; l < reg_max_1; l++)
{
dis += l * dis_after_sm[l];
}
pred_ltrb[k] = dis * stride;
}
float pb_cx = (j + 0.5f) * stride;
float pb_cy = (i + 0.5f) * stride;
float x0 = pb_cx - pred_ltrb[0];
float y0 = pb_cy - pred_ltrb[1];
float x1 = pb_cx + pred_ltrb[2];
float y1 = pb_cy + pred_ltrb[3];
HandROI obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = x1 - x0;
obj.rect.height = y1 - y0;
obj.label = label;
obj.prob = score;
objects.push_back(obj);
}
}
}
}
}

View File

@@ -1,35 +0,0 @@
#ifndef _HAND_COMMON_H_
#define _HAND_COMMON_H_
#include "../../common/common.h"
#include "net.h"
#include <vector>
namespace ovhand {
struct HandROI {
ov::Rect rect;
int label;
float prob;
};
struct GridAndStride
{
int grid0;
int grid1;
int stride;
};
inline float intersection_area(const HandROI& a, const HandROI& b);
void qsort_descent_inplace(std::vector<HandROI>& objects, int left, int right);
void qsort_descent_inplace(std::vector<HandROI>& objects);
void nms_sorted_bboxes(const std::vector<HandROI>& objects, std::vector<int>& picked, float nms_threshold);
int generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides);
void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<HandROI>& objects);
void generate_nanodet_proposals(const ncnn::Mat& cls_pred, const ncnn::Mat& dis_pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<HandROI>& objects);
}
#endif // !_HAND_COMMON_H_

View File

@@ -1,7 +1,7 @@
#ifndef _HAND_DETECTER_C_H_ #ifndef _HAND_DETECTER_C_H_
#define _HAND_DETECTER_C_H_ #define _HAND_DETECTER_C_H_
#include "common.h" #include "../common/common.h"
#ifdef __cplusplus #ifdef __cplusplus
#include "detecter/detecter.hpp" #include "detecter/detecter.hpp"
@@ -12,7 +12,7 @@ extern "C" {
IHandDetecter new_nanodet(); IHandDetecter new_nanodet();
int extract_hand_rois(IHandDetecter d, const unsigned char* rgbdata, int extract_hand_rois(IHandDetecter d, const unsigned char* rgbdata,
int img_width, int img_height, int img_width, int img_height,
HandROIVector* rois); ObjectInfoVector* rois);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@@ -10,17 +10,23 @@ IHandDetecter new_nanodet() {
return new ovhand::Nanodet(); return new ovhand::Nanodet();
} }
int extract_hand_rois(IHandDetecter d, const unsigned char* rgbdata, int img_width, int img_height, HandROIVector* rois) { int extract_hand_rois(IHandDetecter d, const unsigned char* rgbdata, int img_width, int img_height, ObjectInfoVector* rois) {
std::vector<HandROI> detected; std::vector<ov::ObjectInfo> detected;
int ret = static_cast<ovhand::Detecter*>(d)->Detect(rgbdata, img_width, img_height, &detected); int ret = static_cast<ovhand::Detecter*>(d)->Detect(rgbdata, img_width, img_height, detected);
if (ret != 0) { if (ret != 0) {
return ret; return ret;
} }
rois->length = detected.size(); rois->length = detected.size();
rois->items = (HandROI*)malloc(rois->length * sizeof(HandROI)); rois->items = (ObjectInfo*)malloc(rois->length * sizeof(ObjectInfo));
for (size_t i = 0; i < detected.size(); ++i) { for (size_t i = 0; i < detected.size(); ++i) {
rois->items[i] = detected[i]; ov::ObjectInfo o = detected[i];
rois->items[i] = ObjectInfo{
o.rect,
o.prob,
o.label,
NULL
};
} }
return 0; return 0;
} }

View File

@@ -8,7 +8,7 @@ public:
virtual ~Detecter() {}; virtual ~Detecter() {};
virtual int Detect(const unsigned char*rgbdata, virtual int Detect(const unsigned char*rgbdata,
int img_width, int img_height, int img_width, int img_height,
std::vector<HandROI>* rois) = 0; std::vector<ov::ObjectInfo>& rois) = 0;
}; };
class DetecterFactory { class DetecterFactory {

View File

@@ -1,11 +1,114 @@
#include "nanodet.hpp" #include "nanodet.hpp"
#include <string> #include <string>
#include <float.h>
#ifdef OV_VULKAN #ifdef OV_VULKAN
#include "gpu.h" #include "gpu.h"
#endif // OV_VULKAN #endif // OV_VULKAN
namespace ovhand { namespace ovhand {
static void generate_nanodet_proposals(const ncnn::Mat& cls_pred, const ncnn::Mat& dis_pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<ov::ObjectInfo>& objects)
{
const int num_grid = cls_pred.h;
int num_grid_x;
int num_grid_y;
if (in_pad.w > in_pad.h)
{
num_grid_x = in_pad.w / stride;
num_grid_y = num_grid / num_grid_x;
}
else
{
num_grid_y = in_pad.h / stride;
num_grid_x = num_grid / num_grid_y;
}
const int num_class = cls_pred.w;
const int reg_max_1 = dis_pred.w / 4;
//__android_log_print(ANDROID_LOG_WARN, "ncnn","cls_pred h %d, w %d",cls_pred.h,cls_pred.w);
//__android_log_print(ANDROID_LOG_WARN, "ncnn","%d,%d,%d,%d",num_grid_x,num_grid_y,num_class,reg_max_1);
for (int i = 0; i < num_grid_y; i++)
{
for (int j = 0; j < num_grid_x; j++)
{
const int idx = i * num_grid_x + j;
const float* scores = cls_pred.row(idx);
// find label with max score
int label = -1;
float score = -FLT_MAX;
for (int k = 0; k < num_class; k++)
{
if (scores[k] > score)
{
label = k;
score = scores[k];
}
}
if (score >= prob_threshold)
{
ncnn::Mat bbox_pred(reg_max_1, 4, (void*)dis_pred.row(idx));
{
ncnn::Layer* softmax = ncnn::create_layer("Softmax");
ncnn::ParamDict pd;
pd.set(0, 1); // axis
pd.set(1, 1);
softmax->load_param(pd);
ncnn::Option opt;
opt.num_threads = 1;
opt.use_packing_layout = false;
softmax->create_pipeline(opt);
softmax->forward_inplace(bbox_pred, opt);
softmax->destroy_pipeline(opt);
delete softmax;
}
float pred_ltrb[4];
for (int k = 0; k < 4; k++)
{
float dis = 0.f;
const float* dis_after_sm = bbox_pred.row(k);
for (int l = 0; l < reg_max_1; l++)
{
dis += l * dis_after_sm[l];
}
pred_ltrb[k] = dis * stride;
}
float pb_cx = (j + 0.5f) * stride;
float pb_cy = (i + 0.5f) * stride;
float x0 = pb_cx - pred_ltrb[0];
float y0 = pb_cy - pred_ltrb[1];
float x1 = pb_cx + pred_ltrb[2];
float y1 = pb_cy + pred_ltrb[3];
ov::ObjectInfo obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = x1 - x0;
obj.rect.height = y1 - y0;
obj.label = label;
obj.prob = score;
objects.push_back(obj);
}
}
}
}
Nanodet::Nanodet() : Nanodet::Nanodet() :
net_ (new ncnn::Net()), net_ (new ncnn::Net()),
initialized_(false) { initialized_(false) {
@@ -31,7 +134,7 @@ int Nanodet::LoadModel(const char * root_path) {
int Nanodet::Detect(const unsigned char* rgbdata, int Nanodet::Detect(const unsigned char* rgbdata,
int img_width, int img_height, int img_width, int img_height,
std::vector<HandROI>* rois) { std::vector<ov::ObjectInfo>& rois) {
if (!initialized_) { if (!initialized_) {
return 10000; return 10000;
} }
@@ -39,8 +142,6 @@ int Nanodet::Detect(const unsigned char* rgbdata,
return 10001; return 10001;
} }
const int target_size = 320;
int w = img_width; int w = img_width;
int h = img_height; int h = img_height;
float scale = 1.f; float scale = 1.f;
@@ -62,17 +163,13 @@ int Nanodet::Detect(const unsigned char* rgbdata,
ncnn::Mat in_pad; ncnn::Mat in_pad;
ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f); ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
const float norm_vals[3] = {1.f / 57.375f, 1.f / 57.12f, 1.f / 58.395f};
in_pad.substract_mean_normalize(mean_vals, norm_vals); in_pad.substract_mean_normalize(mean_vals, norm_vals);
ncnn::Extractor ex = net_->create_extractor(); ncnn::Extractor ex = net_->create_extractor();
//__android_log_print(ANDROID_LOG_WARN, "ncnn","input w:%d,h:%d",in_pad.w,in_pad.h); //__android_log_print(ANDROID_LOG_WARN, "ncnn","input w:%d,h:%d",in_pad.w,in_pad.h);
ex.input("input.1", in_pad); ex.input("input.1", in_pad);
const float prob_threshold = 0.4f; std::vector<ov::ObjectInfo> proposals;
const float nms_threshold = 0.5f;
std::vector<HandROI> proposals;
// stride 8 // stride 8
{ {
ncnn::Mat cls_pred; ncnn::Mat cls_pred;
@@ -80,7 +177,7 @@ int Nanodet::Detect(const unsigned char* rgbdata,
ex.extract("cls_pred_stride_8", cls_pred); ex.extract("cls_pred_stride_8", cls_pred);
ex.extract("dis_pred_stride_8", dis_pred); ex.extract("dis_pred_stride_8", dis_pred);
std::vector<HandROI> objects8; std::vector<ov::ObjectInfo> objects8;
generate_nanodet_proposals(cls_pred, dis_pred, 8, in_pad, prob_threshold, objects8); generate_nanodet_proposals(cls_pred, dis_pred, 8, in_pad, prob_threshold, objects8);
proposals.insert(proposals.end(), objects8.begin(), objects8.end()); proposals.insert(proposals.end(), objects8.begin(), objects8.end());
@@ -93,7 +190,7 @@ int Nanodet::Detect(const unsigned char* rgbdata,
ex.extract("cls_pred_stride_16", cls_pred); ex.extract("cls_pred_stride_16", cls_pred);
ex.extract("dis_pred_stride_16", dis_pred); ex.extract("dis_pred_stride_16", dis_pred);
std::vector<HandROI> objects16; std::vector<ov::ObjectInfo> objects16;
generate_nanodet_proposals(cls_pred, dis_pred, 16, in_pad, prob_threshold, objects16); generate_nanodet_proposals(cls_pred, dis_pred, 16, in_pad, prob_threshold, objects16);
proposals.insert(proposals.end(), objects16.begin(), objects16.end()); proposals.insert(proposals.end(), objects16.begin(), objects16.end());
@@ -106,7 +203,7 @@ int Nanodet::Detect(const unsigned char* rgbdata,
ex.extract("cls_pred_stride_32", cls_pred); ex.extract("cls_pred_stride_32", cls_pred);
ex.extract("dis_pred_stride_32", dis_pred); ex.extract("dis_pred_stride_32", dis_pred);
std::vector<HandROI> objects32; std::vector<ov::ObjectInfo> objects32;
generate_nanodet_proposals(cls_pred, dis_pred, 32, in_pad, prob_threshold, objects32); generate_nanodet_proposals(cls_pred, dis_pred, 32, in_pad, prob_threshold, objects32);
proposals.insert(proposals.end(), objects32.begin(), objects32.end()); proposals.insert(proposals.end(), objects32.begin(), objects32.end());
@@ -120,10 +217,11 @@ int Nanodet::Detect(const unsigned char* rgbdata,
nms_sorted_bboxes(proposals, picked, nms_threshold); nms_sorted_bboxes(proposals, picked, nms_threshold);
int count = picked.size(); int count = picked.size();
rois.resize(count);
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++)
{ {
HandROI roi = proposals[picked[i]]; ov::ObjectInfo roi = proposals[picked[i]];
// adjust offset to original unpadded // adjust offset to original unpadded
float x0 = (roi.rect.x - (wpad / 2)) / scale; float x0 = (roi.rect.x - (wpad / 2)) / scale;
@@ -142,17 +240,17 @@ int Nanodet::Detect(const unsigned char* rgbdata,
roi.rect.width = x1 - x0; roi.rect.width = x1 - x0;
roi.rect.height = y1 - y0; roi.rect.height = y1 - y0;
rois->push_back(roi); rois[i] = roi;
} }
// sort objects by area // sort objects by area
struct struct
{ {
bool operator()(const HandROI& a, const HandROI& b) const bool operator()(const ov::ObjectInfo& a, const ov::ObjectInfo& b) const
{ {
return a.rect.area() > b.rect.area(); return a.rect.area() > b.rect.area();
} }
} objects_area_greater; } objects_area_greater;
std::sort(rois->begin(), rois->end(), objects_area_greater); std::sort(rois.begin(), rois.end(), objects_area_greater);
return 0; return 0;
} }
} }

View File

@@ -15,11 +15,16 @@ public:
 	int LoadModel(const char* root_path);
 	int Detect(const unsigned char* rgbadata,
 		int img_width, int img_height,
-		std::vector<HandROI>* rois);
+		std::vector<ov::ObjectInfo>& rois);
 private:
 	ncnn::Net* net_;
 	bool initialized_;
+	const int target_size = 320;
+	const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
+	const float norm_vals[3] = {1.f / 57.375f, 1.f / 57.12f, 1.f / 58.395f};
+	const float prob_threshold = 0.4f;
+	const float nms_threshold = 0.5f;
 };
 }
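With the out-parameter changed from a pointer to a reference, callers now pass the result vector directly instead of its address; the same pattern applies to the Yolox detecter below. A minimal caller-side sketch (the detecter construction, model loading, and the rgb pixel buffer are assumed, not shown in this diff):

// Sketch only: `detecter` is an initialized Nanodet and `rgb` points to
// packed RGB pixels of size width * height * 3.
std::vector<ov::ObjectInfo> rois;
int ret = detecter.Detect(rgb, width, height, rois);   // was: Detect(..., &rois)
if (ret == 0) {
    for (const auto& roi : rois) {
        // roi.rect, roi.prob and roi.label describe one detected hand
    }
}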
View File
@@ -8,6 +8,55 @@
 namespace ovhand {
+static void generate_yolox_proposals(std::vector<ov::GridAndStride> grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<ov::ObjectInfo>& objects)
+{
+	const int num_grid = feat_blob.h;
+	const int num_class = feat_blob.w - 5;
+	const int num_anchors = grid_strides.size();
+	const float* feat_ptr = feat_blob.channel(0);
+	for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
+	{
+		const int grid0 = grid_strides[anchor_idx].grid0;
+		const int grid1 = grid_strides[anchor_idx].grid1;
+		const int stride = grid_strides[anchor_idx].stride;
+		// yolox/models/yolo_head.py decode logic
+		// outputs[..., :2] = (outputs[..., :2] + grids) * strides
+		// outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
+		float x_center = (feat_ptr[0] + grid0) * stride;
+		float y_center = (feat_ptr[1] + grid1) * stride;
+		float w = exp(feat_ptr[2]) * stride;
+		float h = exp(feat_ptr[3]) * stride;
+		float x0 = x_center - w * 0.5f;
+		float y0 = y_center - h * 0.5f;
+		float box_objectness = feat_ptr[4];
+		for (int class_idx = 0; class_idx < num_class; class_idx++)
+		{
+			float box_cls_score = feat_ptr[5 + class_idx];
+			float box_prob = box_objectness * box_cls_score;
+			if (box_prob > prob_threshold)
+			{
+				ov::ObjectInfo obj;
+				obj.rect.x = x0;
+				obj.rect.y = y0;
+				obj.rect.width = w;
+				obj.rect.height = h;
+				obj.label = class_idx;
+				obj.prob = box_prob;
+				objects.push_back(obj);
+			}
+		} // class loop
+		feat_ptr += feat_blob.w;
+	} // point anchor loop
+}
 Yolox::Yolox() :
 	net_ (new ncnn::Net()),
 	initialized_(false) {
@@ -34,7 +83,7 @@ int Yolox::LoadModel(const char * root_path) {
 int Yolox::Detect(const unsigned char* rgbdata,
 	int img_width, int img_height,
-	std::vector<HandROI>* rois) {
+	std::vector<ov::ObjectInfo>& rois) {
 	if (!initialized_) {
 		return 10000;
 	}
@@ -42,8 +91,6 @@ int Yolox::Detect(const unsigned char* rgbdata,
 		return 10001;
 	}
-	const int target_size = 416;
 	int w = img_width;
 	int h = img_height;
 	float scale = 1.f;
@@ -65,8 +112,6 @@ int Yolox::Detect(const unsigned char* rgbdata,
 	ncnn::Mat in_pad;
 	ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);
-	const float mean_vals[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f};
-	const float norm_vals[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)};
 	// so for 0-255 input image, rgb_mean should multiply 255 and norm should div by std.
 	in_pad.substract_mean_normalize(mean_vals, norm_vals);
@@ -76,13 +121,11 @@ int Yolox::Detect(const unsigned char* rgbdata,
 	ncnn::Mat out;
 	ex.extract("output", out);
-	const float prob_threshold = 0.45f;
-	const float nms_threshold = 0.65f;
-	std::vector<HandROI> proposals;
+	std::vector<ov::ObjectInfo> proposals;
 	{
 		std::vector<int> strides = {8, 16, 32}; // might have stride=64
-		std::vector<GridAndStride> grid_strides;
+		std::vector<ov::GridAndStride> grid_strides;
 		generate_grids_and_stride(target_size, strides, grid_strides);
 		generate_yolox_proposals(grid_strides, out, prob_threshold, proposals);
 	}
@@ -96,9 +139,11 @@ int Yolox::Detect(const unsigned char* rgbdata,
 	int count = picked.size();
+	rois.resize(count);
 	for (int i = 0; i < count; i++)
 	{
-		HandROI roi = proposals[picked[i]];
+		ov::ObjectInfo roi = proposals[picked[i]];
 		// adjust offset to original unpadded
 		float x0 = (roi.rect.x) / scale;
@@ -117,7 +162,7 @@ int Yolox::Detect(const unsigned char* rgbdata,
 		roi.rect.width = x1 - x0;
 		roi.rect.height = y1 - y0;
-		rois->push_back(roi);
+		rois[i] = roi;
 	}
 	return 0;
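For context, generate_yolox_proposals above applies the anchor-free YOLOX decode: the first two channels are offsets added to the grid cell and scaled by the stride, the next two are log-space sizes. A small standalone check of that arithmetic with made-up values (not taken from the model):

// Hypothetical raw prediction at grid cell (grid0 = 10, grid1 = 5) on stride 8.
float pred[4] = {0.3f, -0.2f, 0.7f, 0.1f};
float x_center = (pred[0] + 10) * 8;    // 82.4
float y_center = (pred[1] + 5) * 8;     // 38.4
float w = exp(pred[2]) * 8;             // ~16.11
float h = exp(pred[3]) * 8;             // ~8.84
float x0 = x_center - w * 0.5f;         // ~74.3, the top-left x stored in ObjectInfo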
View File
@@ -15,11 +15,16 @@ public:
 	int LoadModel(const char* root_path);
 	int Detect(const unsigned char* rgbadata,
 		int img_width, int img_height,
-		std::vector<HandROI>* rois);
+		std::vector<ov::ObjectInfo>& rois);
 private:
 	ncnn::Net* net_;
 	bool initialized_;
+	const int target_size = 416;
+	const float mean_vals[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f};
+	const float norm_vals[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)};
+	const float prob_threshold = 0.45f;
+	const float nms_threshold = 0.65f;
 };
 }
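A note on the mean/norm members above and the in-code comment about 0-255 input: ncnn's substract_mean_normalize applies (pixel - mean) * norm per channel on raw 0-255 values, so folding in the usual ImageNet statistics means mean = 255 * mean_imagenet and norm = 1 / (255 * std_imagenet). A quick sanity check with a hypothetical pixel value:

// (pixel - 255*mean) * (1 / (255*std))  ==  (pixel/255 - mean) / std
float pixel = 128.f;
float mean = 0.485f, stdv = 0.229f;
float a = (pixel - 255.f * mean) * (1.f / (255.f * stdv)); // ~0.0741
float b = (pixel / 255.f - mean) / stdv;                   // ~0.0741, same value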
View File
@@ -1,7 +1,7 @@
 #ifndef _HAND_POSE_C_H_
 #define _HAND_POSE_C_H_
-#include "common.h"
+#include "../common/common.h"
 #ifdef __cplusplus
 #include "pose/estimator.hpp"
View File
@@ -10,9 +10,9 @@ int hand_pose(IHandPoseEstimator d, const unsigned char* rgbdata,
 	int img_width, int img_height,
 	const Rect* rect,
 	Point2fVector* keypoints) {
-	std::vector<ov::Point2f>points;
-	int ret = static_cast<ovhand::HandPose*>(d)->Detect(rgbdata, img_width, img_height, *rect, &points);
+	std::vector<ov::Point2f> points;
+	int ret = static_cast<ovhand::HandPose*>(d)->Detect(rgbdata, img_width, img_height, *rect, points);
 	if (ret != 0) {
 		return ret;
 	}
View File
@@ -11,7 +11,7 @@ public:
 	virtual int Detect(const unsigned char*rgbdata,
 		int img_width, int img_height,
 		const ov::Rect& rect,
-		std::vector<ov::Point2f>* keypoints) = 0;
+		std::vector<ov::Point2f>& keypoints) = 0;
 };
 class PoseEstimatorFactory {
View File
@@ -32,8 +32,8 @@ int HandPose::LoadModel(const char * root_path) {
 int HandPose::Detect(const unsigned char* rgbdata,
 	int img_width, int img_height,
 	const ov::Rect& rect,
-	std::vector<ov::Point2f>* keypoints) {
-	keypoints->clear();
+	std::vector<ov::Point2f>& keypoints) {
+	keypoints.clear();
 	if (!initialized_) {
 		return 10000;
 	}
@@ -58,6 +58,8 @@ int HandPose::Detect(const unsigned char* rgbdata,
 	ex1.input("input", ncnn_in);
 	ncnn::Mat ncnn_out;
 	ex1.extract("output", ncnn_out);
+	keypoints.resize(21);
 	for (int c = 0; c < ncnn_out.c; c++)
 	{
 		ncnn::Mat data = ncnn_out.channel(c);
@@ -66,8 +68,7 @@ int HandPose::Detect(const unsigned char* rgbdata,
 		{
 			float pt_x = ptr[j * 2] * rect.width;
 			float pt_y = ptr[j * 2 + 1] * rect.height;
-			keypoints->push_back(ov::Point2f(pt_x + rect.x, pt_y + rect.y));
+			keypoints[j] = ov::Point2f(pt_x + rect.x, pt_y + rect.y);
 		}
 	}
 	free(crop_img);
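Because the output vector is now pre-sized to 21 entries and written by index, callers always receive a fixed-length result of 21 hand landmarks. A minimal caller-side sketch (the estimator setup, the rgb frame buffer, and the detected hand rect are assumed here):

// Sketch only: `pose` is an initialized HandPose, `rgb` holds packed RGB
// pixels, and `hand_rect` is one ov::Rect returned by a hand detecter.
std::vector<ov::Point2f> pts;
if (pose.Detect(rgb, width, height, hand_rect, pts) == 0) {
    // pts.size() == 21; entries are already offset back into image coordinates
    for (const auto& p : pts) {
        // consume p.x, p.y
    }
}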
View File
@@ -16,11 +16,13 @@ public:
 	int Detect(const unsigned char* rgbdata,
 		int img_width, int img_height,
 		const ov::Rect& rect,
-		std::vector<ov::Point2f>* keypoints);
+		std::vector<ov::Point2f>& keypoints);
 private:
 	ncnn::Net* net_;
 	bool initialized_;
+	const float meanVals[3] = { 128.0f, 128.0f, 128.0f };
+	const float normVals[3] = { 0.00390625f, 0.00390625f, 0.00390625f };
 };
 }
View File
@@ -1,41 +0,0 @@
-#ifndef _POSE_COMMON_C_H_
-#define _POSE_COMMON_C_H_
-#include "../common/common.h"
-#ifdef __cplusplus
-#include "common/common.hpp"
-extern "C" {
-#endif
-#ifdef __cplusplus
-typedef ovpose::PoseKeypoint PoseKeypoint;
-typedef ovpose::PoseROI PoseROI;
-#else
-typedef struct PoseKeypoint {
-	Point2f p;
-	float prob;
-} PoseKeypoint;
-typedef struct PoseROI {
-	Rect rect;
-	unsigned char *data;
-	float score;
-} PoseROI;
-#endif
-typedef struct PoseROIVector {
-	PoseROI* items;
-	int length;
-} PoseROIVector;
-typedef struct PoseKeypointVector {
-	PoseKeypoint* points;
-	int length;
-} PoseKeypointVector;
-void FreePoseKeypointVector(PoseKeypointVector *p);
-void FreePoseROI(PoseROI *p);
-void FreePoseROIVector(PoseROIVector *p);
-#ifdef __cplusplus
-}
-#endif
-#endif // !_POSE_COMMON_C_H_
View File
@@ -1,25 +0,0 @@
-#include "../common.h"
-void FreePoseKeypointVector(PoseKeypointVector *p) {
-	if (p->points != NULL) {
-		free(p->points);
-		p->points = NULL;
-	}
-}
-void FreePoseROI(PoseROI *p) {
-	if (p->data!= NULL) {
-		free(p->data);
-		p->data= NULL;
-	}
-}
-void FreePoseROIVector(PoseROIVector *p) {
-	if (p->items!= NULL) {
-		for (int i=0; i < p->length; i ++) {
-			FreePoseROI(&p->items[i]);
-		}
-		free(p->items);
-		p->items= NULL;
-	}
-}
View File
@@ -1,18 +0,0 @@
-#ifndef _POSE_COMMON_H_
-#define _POSE_COMMON_H_
-#include "../../common/common.h"
-namespace ovpose {
-struct PoseKeypoint {
-	ov::Point2f p;
-	float prob;
-};
-struct PoseROI {
-	ov::Rect rect;
-	unsigned char *data;
-	float score;
-};
-}
-#endif // !_POSE_COMMON_H_
View File
@@ -1,7 +1,7 @@
 #ifndef _POSE_DETECTER_C_H_
 #define _POSE_DETECTER_C_H_
-#include "common.h"
+#include "../common/common.h"
 #ifdef __cplusplus
 #include "detecter/detecter.hpp"
@@ -11,8 +11,10 @@ extern "C" {
 IPoseDetecter new_ultralight();
 int extract_pose_rois(IPoseDetecter d, const unsigned char* rgbdata,
 	int img_width, int img_height,
-	PoseROIVector* rois);
-int extract_pose_keypoints(IPoseDetecter d, const PoseROI* roi, PoseKeypointVector* keypoints);
+	ObjectInfoVector* rois);
+int extract_pose_keypoints(IPoseDetecter d, const unsigned char* rgbdata,
+	int img_width, int img_height,
+	const Rect* rect, KeypointVector* keypoints);
 #ifdef __cplusplus
 }
 #endif
View File
@@ -5,29 +5,35 @@ IPoseDetecter new_ultralight() {
 	return new ovpose::Ultralight();
 }
-int extract_pose_rois(IPoseDetecter d, const unsigned char* rgbdata, int img_width, int img_height, PoseROIVector* rois) {
-	std::vector<PoseROI> detected;
+int extract_pose_rois(IPoseDetecter d, const unsigned char* rgbdata, int img_width, int img_height, ObjectInfoVector* rois) {
+	std::vector<ov::ObjectInfo> detected;
 	int ret = static_cast<ovpose::Detecter*>(d)->ExtractROIs(rgbdata, img_width, img_height, &detected);
 	if (ret != 0) {
 		return ret;
 	}
 	rois->length = detected.size();
-	rois->items = (PoseROI*)malloc(rois->length * sizeof(PoseROI));
+	rois->items = (ObjectInfo*)malloc(rois->length * sizeof(ObjectInfo));
 	for (size_t i = 0; i < detected.size(); ++i) {
-		rois->items[i] = detected[i];
+		ov::ObjectInfo o = detected[i];
+		rois->items[i] = ObjectInfo{
+			o.rect,
+			o.prob,
+			o.label,
+			NULL
+		};
 	}
 	return 0;
 }
-int extract_pose_keypoints(IPoseDetecter d, const PoseROI* roi, PoseKeypointVector* keypoints) {
-	std::vector<PoseKeypoint> points;
-	int ret = static_cast<ovpose::Detecter*>(d)->ExtractKeypoints(*roi, &points);
+int extract_pose_keypoints(IPoseDetecter d, const unsigned char* rgbdata, int img_width, int img_height, const Rect* rect, KeypointVector* keypoints) {
+	std::vector<ov::Keypoint> points;
+	int ret = static_cast<ovpose::Detecter*>(d)->ExtractKeypoints(rgbdata, img_width, img_height, *rect, &points);
 	if (ret != 0) {
 		return ret;
 	}
 	keypoints->length = points.size();
-	keypoints->points = (PoseKeypoint*)malloc(keypoints->length * sizeof(PoseKeypoint));
+	keypoints->points = (Keypoint*)malloc(keypoints->length * sizeof(Keypoint));
 	for (size_t i = 0; i < points.size(); ++i) {
 		keypoints->points[i] = points[i];
 	}
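With the PoseROI types gone, the C API now returns generic ObjectInfoVector boxes and takes the original frame plus a Rect when extracting keypoints. A hedged usage sketch of the new call sequence; model loading and error handling are omitted, and the ObjectInfo/Keypoint member names are assumed to mirror the C++ structs used above:

/* Sketch only: detect person ROIs, then extract keypoints for the first one. */
IPoseDetecter d = new_ultralight();
ObjectInfoVector rois;
if (extract_pose_rois(d, rgbdata, img_width, img_height, &rois) == 0 && rois.length > 0) {
    KeypointVector kps;
    Rect rect = rois.items[0].rect;   /* box of the first detected person (field name assumed) */
    if (extract_pose_keypoints(d, rgbdata, img_width, img_height, &rect, &kps) == 0) {
        for (int i = 0; i < kps.length; ++i) {
            /* kps.points[i].p / kps.points[i].prob hold one body keypoint */
        }
    }
    /* the caller owns kps.points and rois.items, which were malloc'd by the wrapper */
}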
View File
@@ -10,8 +10,10 @@ public:
 	virtual ~Detecter(){};
 	virtual int ExtractROIs(const unsigned char* rgbadata,
 		int img_width, int img_height,
-		std::vector<PoseROI>* rois) = 0;
-	virtual int ExtractKeypoints(const PoseROI& roi, std::vector<PoseKeypoint>* keypoints) = 0;
+		std::vector<ov::ObjectInfo>* rois) = 0;
+	virtual int ExtractKeypoints(const unsigned char* rgbdata,
+		int img_width, int img_height,
+		const ov::Rect& rect, std::vector<ov::Keypoint>* keypoints) = 0;
 };
 class DetecterFactory {
View File
@@ -42,7 +42,7 @@ int Ultralight::LoadModel(const char * root_path) {
 int Ultralight::ExtractROIs(const unsigned char* rgbdata,
 	int img_width, int img_height,
-	std::vector<PoseROI>* rois) {
+	std::vector<ov::ObjectInfo>* rois) {
 	if (!initialized_) {
 		return 10000;
 	}
@@ -52,8 +52,6 @@ int Ultralight::ExtractROIs(const unsigned char* rgbdata,
 	ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgbdata,
 		ncnn::Mat::PIXEL_RGB, img_width, img_height, 320, 320);
 	//数据预处理 (input preprocessing)
-	const float mean_vals[3] = {0.f, 0.f, 0.f};
-	const float norm_vals[3] = {1/255.f, 1/255.f, 1/255.f};
 	in.substract_mean_normalize(mean_vals, norm_vals);
 	ncnn::Extractor ex = roi_net_->create_extractor();
@@ -98,32 +96,32 @@ int Ultralight::ExtractROIs(const unsigned char* rgbdata,
 		if(y2>img_height) y2=img_height;
 		//截取人体ROI (crop the person ROI)
 		//printf("x1:%f y1:%f x2:%f y2:%f\n",x1,y1,x2,y2);
-		Rect rect = ov::Rect(x1, y1, x2-x1, y2-y1);
-		size_t total_size = rect.width * rect.height * 3 * sizeof(unsigned char);
-		PoseROI roi;
+		ov::Rect rect = ov::Rect(x1, y1, x2-x1, y2-y1);
+		ov::ObjectInfo roi;
 		roi.rect = rect;
-		roi.score = score;
-		roi.data = (unsigned char*)malloc(total_size);
-		const unsigned char *start_ptr = rgbdata;
-		for(size_t i = 0; i < rect.height; ++i) {
-			const unsigned char* srcCursor = start_ptr + ((i + rect.y) * img_width + rect.x) * 3;
-			unsigned char* dstCursor = roi.data + i * rect.width * 3;
-			memcpy(dstCursor, srcCursor, sizeof(unsigned char) * 3 * rect.width);
-		}
+		roi.prob = score;
 		rois->push_back(roi);
 	}
 	return 0;
 }
-int Ultralight::ExtractKeypoints(const PoseROI& roi, std::vector<PoseKeypoint>* keypoints) {
+int Ultralight::ExtractKeypoints(const unsigned char* rgbdata,
+	int img_width, int img_height,
+	const ov::Rect& rect, std::vector<ov::Keypoint>* keypoints) {
 	keypoints->clear();
-	int w = roi.rect.width;
-	int h = roi.rect.height;
-	ncnn::Mat in = ncnn::Mat::from_pixels_resize(roi.data, ncnn::Mat::PIXEL_RGB, w, h, 192, 256);
+	int w = rect.width;
+	int h = rect.height;
+	size_t total_size = w * h * 3 * sizeof(unsigned char);
+	unsigned char* data = (unsigned char*)malloc(total_size);
+	const unsigned char *start_ptr = rgbdata;
+	for(size_t i = 0; i < h; ++i) {
+		const unsigned char* srcCursor = start_ptr + ((i + rect.y) * img_width + rect.x) * 3;
+		unsigned char* dstCursor = data + i * w * 3;
+		memcpy(dstCursor, srcCursor, sizeof(unsigned char) * 3 * w);
+	}
+	ncnn::Mat in = ncnn::Mat::from_pixels_resize(data, ncnn::Mat::PIXEL_RGB, w, h, 192, 256);
 	//数据预处理 (input preprocessing)
-	const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f};
-	const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f};
-	in.substract_mean_normalize(mean_vals, norm_vals);
+	in.substract_mean_normalize(meanVals, normVals);
 	ncnn::Extractor ex = pose_net_->create_extractor();
 	ex.set_num_threads(4);
@@ -152,11 +150,13 @@ int Ultralight::ExtractKeypoints(const PoseROI& roi, std::vector<PoseKeypoint>*
 			}
 		}
-		PoseKeypoint keypoint;
-		keypoint.p = Point2f(max_x * w / (float)out.w+roi.rect.x, max_y * h / (float)out.h+roi.rect.y);
+		ov::Keypoint keypoint;
+		keypoint.p = ov::Point2f(max_x * w / (float)out.w+rect.x, max_y * h / (float)out.h+rect.y);
 		keypoint.prob = max_prob;
 		keypoints->push_back(keypoint);
 	}
+	free(data);
 	return 0;
 }
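The keypoint above is recovered from the heatmap argmax: the peak cell (max_x, max_y) is scaled from heatmap resolution back to the ROI size and then offset by the ROI origin. A small numeric sketch with hypothetical values (the real heatmap dimensions come from the ncnn output blob):

// Hypothetical: heatmap out.w = 48, out.h = 64; ROI is 96x128 at (rect.x, rect.y) = (200, 50).
int max_x = 12, max_y = 40;
float img_x = max_x * 96 / 48.f + 200;   // 224 in the original image
float img_y = max_y * 128 / 64.f + 50;   // 130 in the original image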
View File
@@ -15,14 +15,20 @@ public:
 	int LoadModel(const char* root_path);
 	int ExtractROIs(const unsigned char* rgbadata,
 		int img_width, int img_height,
-		std::vector<PoseROI>* rois);
-	int ExtractKeypoints(const PoseROI& roi,
-		std::vector<PoseKeypoint>* keypoints);
+		std::vector<ov::ObjectInfo>* rois);
+	int ExtractKeypoints(const unsigned char* rgbdata,
+		int img_width, int img_height,
+		const ov::Rect& rect,
+		std::vector<ov::Keypoint>* keypoints);
 private:
 	ncnn::Net* roi_net_;
 	ncnn::Net* pose_net_;
 	bool initialized_;
+	const float mean_vals[3] = {0.f, 0.f, 0.f};
+	const float norm_vals[3] = {1/255.f, 1/255.f, 1/255.f};
+	const float meanVals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f};
+	const float normVals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f};
 };
 }
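Taken together, these changes replace the per-module HandROI/PoseROI/PoseKeypoint types with the shared ov::ObjectInfo and ov::Keypoint structs, so hand detection, pose detection, and the C bindings all exchange the same box and keypoint shapes. Judging only from how the fields are used in this diff, the shared C++ types look roughly like the sketch below; member order and any extra fields are assumptions, and the C-side ObjectInfo adds a fourth member that the wrapper above initializes to NULL:

// Inferred from usage in this commit, not copied from common.hpp.
namespace ov {
struct ObjectInfo {
    Rect rect;     // detection box (rect.area() is used for sorting)
    float prob;    // detection score
    int label;     // class index
};
struct Keypoint {
    Point2f p;     // location in image coordinates
    float prob;    // confidence
};
}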