perf: avoid unnecessary memory allocation

This commit is contained in:
esimov
2021-10-22 08:26:56 +03:00
parent 08dd5acac8
commit 494cb7ab59
4 changed files with 189 additions and 171 deletions

View File

@@ -5,6 +5,7 @@ import (
"io/ioutil"
"math"
"path/filepath"
"sync"
)
// FlpCascade holds the binary representation of the facial landmark points cascade files
@@ -13,6 +14,12 @@ type FlpCascade struct {
error
}
var flplocPool = sync.Pool{
New: func() interface{} {
return &Puploc{}
},
}
// UnpackFlp unpacks the facial landmark points cascade file.
// This will return the binary representation of the cascade file.
func (plc *PuplocCascade) UnpackFlp(cf string) (*PuplocCascade, error) {
@@ -25,7 +32,6 @@ func (plc *PuplocCascade) UnpackFlp(cf string) (*PuplocCascade, error) {
// GetLandmarkPoint retrieves the facial landmark point based on the pupil localization results.
func (plc *PuplocCascade) GetLandmarkPoint(leftEye, rightEye *Puploc, img ImageParams, perturb int, flipV bool) *Puploc {
var flploc *Puploc
dx := (leftEye.Row - rightEye.Row) * (leftEye.Row - rightEye.Row)
dy := (leftEye.Col - rightEye.Col) * (leftEye.Col - rightEye.Col)
dist := math.Sqrt(float64(dx + dy))
@@ -34,12 +40,13 @@ func (plc *PuplocCascade) GetLandmarkPoint(leftEye, rightEye *Puploc, img ImageP
col := float64(leftEye.Col+rightEye.Col)/2.0 + 0.15*dist
scale := 3.0 * dist
flploc = &Puploc{
Row: int(row),
Col: int(col),
Scale: float32(scale),
Perturbs: perturb,
}
flploc := flplocPool.Get().(*Puploc)
defer flplocPool.Put(flploc)
flploc.Row = int(row)
flploc.Col = int(col)
flploc.Scale = float32(scale)
flploc.Perturbs = perturb
if flipV {
return plc.RunDetector(*flploc, img, 0.0, true)
@@ -50,14 +57,16 @@ func (plc *PuplocCascade) GetLandmarkPoint(leftEye, rightEye *Puploc, img ImageP
// ReadCascadeDir reads the facial landmark points cascade files from the provided directory.
func (plc *PuplocCascade) ReadCascadeDir(path string) (map[string][]*FlpCascade, error) {
cascades, err := ioutil.ReadDir(path)
if len(cascades) == 0 {
return nil, errors.New("the provided directory is empty")
}
flpcs := make(map[string][]*FlpCascade)
if err != nil {
return nil, err
}
if len(cascades) == 0 {
return nil, errors.New("the provided directory is empty")
}
flpcs := make(map[string][]*FlpCascade, len(cascades))
for _, cascade := range cascades {
cf, err := filepath.Abs(path + "/" + cascade.Name())
if err != nil {

View File

@@ -1,10 +1,10 @@
package pigo
import (
"bytes"
"encoding/binary"
"math"
"sort"
"sync"
"unsafe"
)
@@ -59,63 +59,47 @@ func (pg *Pigo) Unpack(packet []byte) (*Pigo, error) {
// We skip the first 8 bytes of the cascade file.
pos := 8
buff := make([]byte, 4)
dataView := bytes.NewBuffer(buff)
// Read the depth (size) of each tree and write it into the buffer array.
_, err := dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
// Obtain the depth of each tree from the binary data.
treeDepth = binary.LittleEndian.Uint32(packet[pos:])
pos += 4
if dataView.Len() > 0 {
treeDepth = binary.LittleEndian.Uint32(packet[pos:])
pos += 4
// Get the number of cascade trees as 32-bit unsigned integer.
treeNum = binary.LittleEndian.Uint32(packet[pos:])
// Get the number of cascade trees as 32-bit unsigned integer and write it into the buffer array.
_, err := dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
// To avoid constant memory allocation on each append we predefine the slice capacity.
treeThreshold = make([]float32, 0, treeNum)
treeCodes = make([]int8, 0, 119808)
treePred = make([]float32, 0, 29952)
treeNum = binary.LittleEndian.Uint32(packet[pos:])
pos += 4
pos += 4
for t := 0; t < int(treeNum); t++ {
treeCodes = append(treeCodes, []int8{0, 0, 0, 0}...)
for t := 0; t < int(treeNum); t++ {
// Obtain the tree codes of each tree nodes.
treeCodes = append(treeCodes, []int8{0, 0, 0, 0}...)
code := packet[pos : pos+int(4*math.Pow(2, float64(treeDepth))-4)]
// Convert unsigned bytecodes to signed ones.
signedCode := *(*[]int8)(unsafe.Pointer(&code))
treeCodes = append(treeCodes, signedCode...)
code := packet[pos : pos+int(4*pow(2, int(treeDepth))-4)]
// Convert unsigned bytecodes to signed ones.
signedCode := *(*[]int8)(unsafe.Pointer(&code))
treeCodes = append(treeCodes, signedCode...)
pos += int(4*math.Pow(2, float64(treeDepth)) - 4)
pos += int(4*pow(2, int(treeDepth)) - 4)
// Read prediction from tree's leaf nodes.
for i := 0; i < int(math.Pow(2, float64(treeDepth))); i++ {
_, err := dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
u32pred := binary.LittleEndian.Uint32(packet[pos:])
// Convert uint32 to float32
f32pred := *(*float32)(unsafe.Pointer(&u32pred))
treePred = append(treePred, f32pred)
pos += 4
}
// Read tree nodes threshold values.
_, err := dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
u32thr := binary.LittleEndian.Uint32(packet[pos:])
// Read prediction from tree's leaf nodes.
for i := 0; i < int(pow(2, int(treeDepth))); i++ {
u32pred := binary.LittleEndian.Uint32(packet[pos:])
// Convert uint32 to float32
f32thr := *(*float32)(unsafe.Pointer(&u32thr))
treeThreshold = append(treeThreshold, f32thr)
f32pred := *(*float32)(unsafe.Pointer(&u32pred))
treePred = append(treePred, f32pred)
pos += 4
}
u32thr := binary.LittleEndian.Uint32(packet[pos:])
// Convert uint32 to float32
f32thr := *(*float32)(unsafe.Pointer(&u32thr))
treeThreshold = append(treeThreshold, f32thr)
pos += 4
}
return &Pigo{
treeDepth,
treeNum,
@@ -126,11 +110,10 @@ func (pg *Pigo) Unpack(packet []byte) (*Pigo, error) {
}
// classifyRegion constructs the classification function based on the parsed binary data.
func (pg *Pigo) classifyRegion(r, c, s int, pixels []uint8, dim int) float32 {
func (pg *Pigo) classifyRegion(r, c, s, treeDepth int, pixels []uint8, dim int) float32 {
var (
root int
out float32
treeDepth = int(math.Pow(2, float64(pg.treeDepth)))
root int
out float32
)
r = r * 256
@@ -139,7 +122,6 @@ func (pg *Pigo) classifyRegion(r, c, s int, pixels []uint8, dim int) float32 {
if pg.treeNum > 0 {
for i := 0; i < int(pg.treeNum); i++ {
idx := 1
for j := 0; j < int(pg.treeDepth); j++ {
x1 := ((r+int(pg.treeCodes[root+4*idx+0])*s)>>8)*dim + ((c + int(pg.treeCodes[root+4*idx+1])*s) >> 8)
x2 := ((r+int(pg.treeCodes[root+4*idx+2])*s)>>8)*dim + ((c + int(pg.treeCodes[root+4*idx+3])*s) >> 8)
@@ -165,11 +147,10 @@ func (pg *Pigo) classifyRegion(r, c, s int, pixels []uint8, dim int) float32 {
}
// classifyRotatedRegion applies the face classification function over a rotated image based on the parsed binary data.
func (pg *Pigo) classifyRotatedRegion(r, c, s int, a float64, nrows, ncols int, pixels []uint8, dim int) float32 {
func (pg *Pigo) classifyRotatedRegion(r, c, s, treeDepth int, a float64, nrows, ncols int, pixels []uint8, dim int) float32 {
var (
root int
out float32
treeDepth = int(math.Pow(2, float64(pg.treeDepth)))
root int
out float32
)
qCosTable := []int{256, 251, 236, 212, 181, 142, 97, 49, 0, -49, -97, -142, -181, -212, -236, -251, -256, -251, -236, -212, -181, -142, -97, -49, 0, 49, 97, 142, 181, 212, 236, 251, 256}
@@ -218,15 +199,28 @@ type Detection struct {
Q float32
}
// We are using sync.Pool to avoid memory heap allocation
// in order to keep the GC overhead as small as possible.
var detpool = sync.Pool{
New: func() interface{} {
return &Detection{}
},
}
// RunCascade analyze the grayscale converted image pixel data and run the classification function over the detection window.
// It will return a slice containing the detection row, column, it's center and the detection score (in case this is greater than 0.0).
func (pg *Pigo) RunCascade(cp CascadeParams, angle float64) []Detection {
var detections []Detection
var pixels = cp.Pixels
var q float32
var (
detections []Detection
pixels = cp.Pixels
treeDepth = int(pow(2, int(pg.treeDepth)))
q float32
)
scale := cp.MinSize
det := detpool.Get().(*Detection)
defer detpool.Put(det)
// Run the classification function over the detection window
// and check if the false positive rate is above a certain value.
for scale <= cp.MaxSize {
@@ -239,13 +233,18 @@ func (pg *Pigo) RunCascade(cp CascadeParams, angle float64) []Detection {
if angle > 1.0 {
angle = 1.0
}
q = pg.classifyRotatedRegion(row, col, scale, angle, cp.Rows, cp.Cols, pixels, cp.Dim)
q = pg.classifyRotatedRegion(row, col, scale, treeDepth, angle, cp.Rows, cp.Cols, pixels, cp.Dim)
} else {
q = pg.classifyRegion(row, col, scale, pixels, cp.Dim)
q = pg.classifyRegion(row, col, scale, treeDepth, pixels, cp.Dim)
}
det.Row = row
det.Col = col
det.Scale = scale
det.Q = q
if q > 0.0 {
detections = append(detections, Detection{row, col, scale, q})
detections = append(detections, *det)
}
}
}

View File

@@ -1,11 +1,11 @@
package pigo
import (
"bytes"
"encoding/binary"
"math"
"math/rand"
"sort"
"sync"
"unsafe"
)
@@ -41,86 +41,57 @@ func (plc *PuplocCascade) UnpackCascade(packet []byte) (*PuplocCascade, error) {
scales float32
trees uint32
treeDepth uint32
treeCodes []int8
treePreds []float32
treeCodes = make([]int8, 0, 409200)
treePreds = make([]float32, 0, 204800)
)
pos := 0
buff := make([]byte, 4)
dataView := bytes.NewBuffer(buff)
// Get the number of stages as 32-bit unsigned integer.
stages = binary.LittleEndian.Uint32(packet[pos:])
pos += 4
// Read the depth (size) of each tree and write it into the buffer array.
_, err := dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
// Obtain the scale multiplier (applied after each stage).
u32scales := binary.LittleEndian.Uint32(packet[pos:])
// Convert uint32 to float32
scales = *(*float32)(unsafe.Pointer(&u32scales))
pos += 4
if dataView.Len() > 0 {
// Get the number of stages as 32-bit uint and write it into the buffer array.
stages = binary.LittleEndian.Uint32(packet[pos:])
_, err := dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
pos += 4
// Obtain the number of trees per stage.
trees = binary.LittleEndian.Uint32(packet[pos:])
pos += 4
// Obtain the scale multiplier (applied after each stage) and write it into the buffer array.
u32scales := binary.LittleEndian.Uint32(packet[pos:])
// Convert uint32 to float32
scales = *(*float32)(unsafe.Pointer(&u32scales))
_, err = dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
pos += 4
// Obtain the depth of each tree.
treeDepth = binary.LittleEndian.Uint32(packet[pos:])
pos += 4
// Obtain the number of trees per stage and write it into the buffer array.
trees = binary.LittleEndian.Uint32(packet[pos:])
_, err = dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
pos += 4
// Traverse all the stages of the binary tree.
for s := 0; s < int(stages); s++ {
// Traverse the branches of each stage.
for t := 0; t < int(trees); t++ {
depth := int(pow(2, int(treeDepth)))
// Obtain the depth of each tree and write it into the buffer array.
treeDepth = binary.LittleEndian.Uint32(packet[pos:])
_, err = dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
pos += 4
code := packet[pos : pos+4*depth-4]
// Convert unsigned bytecodes to signed ones.
i8code := *(*[]int8)(unsafe.Pointer(&code))
treeCodes = append(treeCodes, i8code...)
// Traverse all the stages of the binary tree
for s := 0; s < int(stages); s++ {
// Traverse the branches of each stage
for t := 0; t < int(trees); t++ {
depth := int(math.Pow(2, float64(treeDepth)))
pos += 4*depth - 4
code := packet[pos : pos+4*depth-4]
// Convert unsigned bytecodes to signed ones.
i8code := *(*[]int8)(unsafe.Pointer(&code))
treeCodes = append(treeCodes, i8code...)
pos += 4*depth - 4
// Read prediction from tree's leaf nodes.
for i := 0; i < depth; i++ {
for l := 0; l < 2; l++ {
_, err := dataView.Write([]byte{packet[pos+0], packet[pos+1], packet[pos+2], packet[pos+3]})
if err != nil {
return nil, err
}
u32pred := binary.LittleEndian.Uint32(packet[pos:])
// Convert uint32 to float32
f32pred := *(*float32)(unsafe.Pointer(&u32pred))
treePreds = append(treePreds, f32pred)
pos += 4
}
// Read prediction from tree's leaf nodes.
for i := 0; i < depth; i++ {
for l := 0; l < 2; l++ {
u32pred := binary.LittleEndian.Uint32(packet[pos:])
// Convert uint32 to float32
f32pred := *(*float32)(unsafe.Pointer(&u32pred))
treePreds = append(treePreds, f32pred)
pos += 4
}
}
}
}
return &PuplocCascade{
stages: stages,
scales: scales,
@@ -132,11 +103,11 @@ func (plc *PuplocCascade) UnpackCascade(packet []byte) (*PuplocCascade, error) {
}
// classifyRegion applies the face classification function over an image.
func (plc *PuplocCascade) classifyRegion(r, c, s float32, nrows, ncols int, pixels []uint8, dim int, flipV bool) []float32 {
var c1, c2 int
root := 0
treeDepth := int(math.Pow(2, float64(plc.treeDepth)))
func (plc *PuplocCascade) classifyRegion(r, c, s float32, treeDepth, nrows, ncols int, pixels []uint8, dim int, flipV bool) []float32 {
var (
c1, c2 int
root int
)
for i := 0; i < int(plc.stages); i++ {
var dr, dc float32 = 0.0, 0.0
@@ -144,17 +115,17 @@ func (plc *PuplocCascade) classifyRegion(r, c, s float32, nrows, ncols int, pixe
for j := 0; j < int(plc.trees); j++ {
idx := 0
for k := 0; k < int(plc.treeDepth); k++ {
r1 := min(nrows-1, max(0, (256*int(r)+int(plc.treeCodes[root+4*idx+0])*int(round(float64(s))))>>8))
r2 := min(nrows-1, max(0, (256*int(r)+int(plc.treeCodes[root+4*idx+2])*int(round(float64(s))))>>8))
r1 := min(nrows-1, max(0, (256*int(r)+int(plc.treeCodes[root+4*idx+0])*int(math.Round(float64(s))))>>8))
r2 := min(nrows-1, max(0, (256*int(r)+int(plc.treeCodes[root+4*idx+2])*int(math.Round(float64(s))))>>8))
// flipV means that we wish to flip the column coordinates sign in the tree nodes.
// This is required at running the facial landmark detector over the right side of the detected face.
if flipV {
c1 = min(ncols-1, max(0, (256*int(c)+int(-plc.treeCodes[root+4*idx+1])*int(round(float64(s))))>>8))
c2 = min(ncols-1, max(0, (256*int(c)+int(-plc.treeCodes[root+4*idx+3])*int(round(float64(s))))>>8))
c1 = min(ncols-1, max(0, (256*int(c)+int(-plc.treeCodes[root+4*idx+1])*int(math.Round(float64(s))))>>8))
c2 = min(ncols-1, max(0, (256*int(c)+int(-plc.treeCodes[root+4*idx+3])*int(math.Round(float64(s))))>>8))
} else {
c1 = min(ncols-1, max(0, (256*int(c)+int(plc.treeCodes[root+4*idx+1])*int(round(float64(s))))>>8))
c2 = min(ncols-1, max(0, (256*int(c)+int(plc.treeCodes[root+4*idx+3])*int(round(float64(s))))>>8))
c1 = min(ncols-1, max(0, (256*int(c)+int(plc.treeCodes[root+4*idx+1])*int(math.Round(float64(s))))>>8))
c2 = min(ncols-1, max(0, (256*int(c)+int(plc.treeCodes[root+4*idx+3])*int(math.Round(float64(s))))>>8))
}
bintest := func(p1, p2 uint8) uint8 {
if p1 > p2 {
@@ -183,11 +154,11 @@ func (plc *PuplocCascade) classifyRegion(r, c, s float32, nrows, ncols int, pixe
}
// classifyRotatedRegion applies the face classification function over a rotated image.
func (plc *PuplocCascade) classifyRotatedRegion(r, c, s float32, a float64, nrows, ncols int, pixels []uint8, dim int, flipV bool) []float32 {
var row1, col1, row2, col2 int
root := 0
treeDepth := int(math.Pow(2, float64(plc.treeDepth)))
func (plc *PuplocCascade) classifyRotatedRegion(r, c, s float32, a float64, treeDepth, nrows, ncols int, pixels []uint8, dim int, flipV bool) []float32 {
var (
row1, col1, row2, col2 int
root int
)
qCosTable := []float32{256, 251, 236, 212, 181, 142, 97, 49, 0, -49, -97, -142, -181, -212, -236, -251, -256, -251, -236, -212, -181, -142, -97, -49, 0, 49, 97, 142, 181, 212, 236, 251, 256}
qSinTable := []float32{0, 49, 97, 142, 181, 212, 236, 251, 256, 251, 236, 212, 181, 142, 97, 49, 0, -49, -97, -142, -181, -212, -236, -251, -256, -251, -236, -212, -181, -142, -97, -49, 0}
@@ -245,10 +216,33 @@ func (plc *PuplocCascade) classifyRotatedRegion(r, c, s float32, a float64, nrow
return []float32{r, c, s}
}
// puplocPool is a struct for holding the pupil localization values in sync pool.
type puplocPool struct {
rows []float32
cols []float32
scale []float32
}
// Create a sync.Pool for further reusing the allocated memory space
// in order to keep the GC overhead as low as possible.
var plcDetPool = sync.Pool{
New: func() interface{} {
return &puplocPool{
rows: make([]float32, 63),
cols: make([]float32, 63),
scale: make([]float32, 63),
}
},
}
// RunDetector runs the pupil localization function.
func (plc *PuplocCascade) RunDetector(pl Puploc, img ImageParams, angle float64, flipV bool) *Puploc {
rows, cols, scale := []float32{}, []float32{}, []float32{}
res := []float32{}
var res = make([]float32, 3)
det := plcDetPool.Get().(*puplocPool)
defer plcDetPool.Put(det)
treeDepth := int(pow(2, int(plc.treeDepth)))
for i := 0; i < pl.Perturbs; i++ {
row := float32(pl.Row) + float32(pl.Scale)*0.15*(0.5-rand.Float32())
@@ -259,26 +253,26 @@ func (plc *PuplocCascade) RunDetector(pl Puploc, img ImageParams, angle float64,
if angle > 1.0 {
angle = 1.0
}
res = plc.classifyRotatedRegion(row, col, sc, angle, img.Rows, img.Cols, img.Pixels, img.Dim, flipV)
res = plc.classifyRotatedRegion(row, col, sc, angle, treeDepth, img.Rows, img.Cols, img.Pixels, img.Dim, flipV)
} else {
res = plc.classifyRegion(row, col, sc, img.Rows, img.Cols, img.Pixels, img.Dim, flipV)
res = plc.classifyRegion(row, col, sc, treeDepth, img.Rows, img.Cols, img.Pixels, img.Dim, flipV)
}
rows = append(rows, res[0])
cols = append(cols, res[1])
scale = append(scale, res[2])
det.rows[i] = res[0]
det.cols[i] = res[1]
det.scale[i] = res[2]
}
// Sorting the perturbations in ascendent order
sort.Sort(plocSort(rows))
sort.Sort(plocSort(cols))
sort.Sort(plocSort(scale))
sort.Sort(plocSort(det.rows))
sort.Sort(plocSort(det.cols))
sort.Sort(plocSort(det.scale))
// Get the median value of the sorted perturbation results
return &Puploc{
Row: int(rows[int(round(float64(pl.Perturbs)/2))]),
Col: int(cols[int(round(float64(pl.Perturbs)/2))]),
Scale: scale[int(round(float64(pl.Perturbs)/2))],
Row: int(det.rows[int(math.Round(float64(pl.Perturbs)/2))]),
Col: int(det.cols[int(math.Round(float64(pl.Perturbs)/2))]),
Scale: det.scale[int(math.Round(float64(pl.Perturbs)/2))],
}
}

View File

@@ -1,6 +1,8 @@
package pigo
import "math"
import (
"math"
)
// abs returns the absolute value of the provided number
func abs(x int) int {
@@ -34,3 +36,17 @@ func round(x float64) float64 {
}
return t
}
// pow is a fast multiply operator meant to replace the built-in math.Pow function
// for better performance, where the speed is much important than correctness.
func pow(base float64, exp int) float64 {
result := 1.0
for exp > 0 {
if exp%2 == 1 {
result *= base
}
exp >>= 1
base *= base
}
return result
}