Merge pull request #43 from swdee/reid

Re-Identification and Batch processing
This commit is contained in:
swdee
2025-07-07 16:39:51 +12:00
committed by GitHub
20 changed files with 2380 additions and 18 deletions


@@ -75,6 +75,7 @@ See the [example](example) directory.
* Image Classification
* [MobileNet Demo](example/mobilenet)
* [Pooled Runtime Usage](example/pool)
* [Batch Input Usage](example/batch)
* Object Detection
* [YOLOv5 Demo](example/yolov5)
* [YOLOv8 Demo](example/yolov8)
@@ -97,6 +98,8 @@ See the [example](example) directory.
* [PPOCR Detect](example/ppocr#ppocr-detect) - Takes an image and detects areas of text.
* [PPOCR Recognise](example/ppocr#ppocr-recognise) - Takes an area of text and performs OCR on it.
* [PPOCR System](example/ppocr#ppocr-system) - Combines both Detect and Recognise.
* Tracking
* [Re-Identification Demo](example/reid) - Re-Identify (ReID) similar objects for tracking, uses batch processing.
* Streaming
* [HTTP Stream with ByteTrack Tracking](example/stream) - Demo that streams a video over HTTP with YOLO object detection and ByteTrack object tracking.
* Slicing Aided Hyper Inference
@@ -164,6 +167,24 @@ If you use `rknnlite.NewRuntimeByPlatform()` instead this will be automatically
set for you.
## Runtime Inference
Once a Runtime has been created, inference is performed by passing in the
input tensors.
```
rt.Inference([]gocv.Mat{})
```
The `Inference()` function takes a slice of gocv.Mats, where the number of
elements in the slice corresponds to the number of input tensors the
Model has. Most models have only a single input tensor, so typically a single
gocv.Mat is passed here.
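For example, a minimal single-input sketch (assuming the image has already been
resized to the model's input tensor dimensions):
```
img := gocv.IMRead("image.jpg", gocv.IMReadColor)
defer img.Close()

// run inference on the single input tensor
outputs, err := rt.Inference([]gocv.Mat{img})
if err != nil {
    log.Fatal("inference failed: ", err)
}
// ...post process results...

// free outputs allocated in C memory when finished
outputs.Free()
```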
To pass multiple images in a single `Inference()` call you need to use
[Batching](example/batch).
## CPU Affinity
The performance of the NPU is affected by which CPU cores your program runs on, so

batch.go (new file, 188 lines)

@@ -0,0 +1,188 @@
package rknnlite
import (
"fmt"
"gocv.io/x/gocv"
)
// Batch defines a struct used for concatenating a batch of gocv.Mat's
// together into a single gocv.Mat for use with image batching on
// a Model
type Batch struct {
mat gocv.Mat
// size of the batch
size int
// width is the input tensor size width
width int
// height is the input tensor size height
height int
// channels is the input tensor number of channels
channels int
// inputTypeFloat32 sets the runtime.inputTypeFloat32 value
inputTypeFloat32 bool
// matType is the Mat type images must be passed as
matType gocv.MatType
// matCnt is a counter for how many Mats have been added with Add()
matCnt int
// imgSize stores an images size made up from its elements
imgSize int
}
// NewBatch creates a batch of concatenated Mats for the given input tensor
// and batch size
func NewBatch(batchSize, height, width, channels int, inputTypeFloat32 bool) *Batch {
// Choose output Mat type
var matType gocv.MatType
if inputTypeFloat32 {
matType = gocv.MatTypeCV32F
} else {
matType = gocv.MatTypeCV8U
}
shape := []int{batchSize, height, width, channels}
return &Batch{
size: batchSize,
height: height,
width: width,
channels: channels,
mat: gocv.NewMatWithSizes(shape, matType),
inputTypeFloat32: inputTypeFloat32,
matType: matType,
matCnt: 0,
imgSize: height * width * channels,
}
}
// Add a Mat to the batch
func (b *Batch) Add(img gocv.Mat) error {
// check if batch is full
if b.matCnt >= b.size {
return fmt.Errorf("batch full")
}
res := b.addAt(b.matCnt, img)
if res != nil {
return res
}
// increment image counter
b.matCnt++
return nil
}
// AddAt adds a Mat to the batch at the specific index location
func (b *Batch) AddAt(idx int, img gocv.Mat) error {
if idx < 0 || idx >= b.size {
return fmt.Errorf("index %d out of range [0-%d)", idx, b.size)
}
return b.addAt(idx, img)
}
// addAt adds a Mat to the specified index location
func (b *Batch) addAt(idx int, img gocv.Mat) error {
// validate mat dimensions
if img.Rows() != b.height || img.Cols() != b.width ||
img.Channels() != b.channels {
return fmt.Errorf("image does not match batch shape")
}
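// clone non-contiguous Mats so the DataPtr* calls below expose a single
// continuous buffer we can copy from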
if !img.IsContinuous() {
img = img.Clone()
}
if b.inputTypeFloat32 {
// pointer of the batch mat
dstAll, err := b.mat.DataPtrFloat32()
if err != nil {
return fmt.Errorf("error accessing float32 batch memory: %w", err)
}
src, err := img.DataPtrFloat32()
if err != nil {
return fmt.Errorf("error getting float32 data from image: %w", err)
}
offset := idx * b.imgSize
copy(dstAll[offset:], src)
} else {
// pointer of the batch mat
dstAll, err := b.mat.DataPtrUint8()
if err != nil {
return fmt.Errorf("error accessing uint8 batch memory: %w", err)
}
src, err := img.DataPtrUint8()
if err != nil {
return fmt.Errorf("error getting uint8 data from image: %w", err)
}
offset := idx * b.imgSize
copy(dstAll[offset:], src)
}
return nil
}
// GetOutputInt returns the tensor output for the specified image number
// as an int8 output. idx starts counting from 0 to (batchSize-1)
func (b *Batch) GetOutputInt(idx int, outputs Output, size int) ([]int8, error) {
if idx < 0 || idx >= b.size {
return nil, fmt.Errorf("index %d out of range [0-%d)", idx, b.size)
}
offset := idx * size
if offset+size > int(outputs.Size) {
return nil, fmt.Errorf("offset %d out of range [%d,%d)", offset, outputs.Size, offset+size)
}
return outputs.BufInt[offset : offset+size], nil
}
// GetOutputF32 returns the tensor output for the specified image number
// as a float32 output. idx starts counting from 0 to (batchSize-1)
func (b *Batch) GetOutputF32(idx int, outputs Output, size int) ([]float32, error) {
if idx < 0 || idx >= b.size {
return nil, fmt.Errorf("index %d out of range [0-%d)", idx, b.size)
}
offset := idx * size
if offset+size > int(outputs.Size) {
return nil, fmt.Errorf("offset %d out of range [%d,%d)", offset, outputs.Size, offset+size)
}
return outputs.BufFloat[offset : offset+size], nil
}
// Mat returns the concatenated mat
func (b *Batch) Mat() gocv.Mat {
return b.mat
}
// Clear the batch so it can be reused again
func (b *Batch) Clear() {
// just reset the counter, we don't need to clear the underlying b.mat
// as it will be overwritten when Add() is called with new images
b.matCnt = 0
}
// Close the batch and free allocated memory
func (b *Batch) Close() error {
return b.mat.Close()
}

batch_test.go (new file, 329 lines)

@@ -0,0 +1,329 @@
package rknnlite
import (
"errors"
"flag"
"fmt"
"gocv.io/x/gocv"
"path/filepath"
"regexp"
"strconv"
"strings"
"testing"
"time"
)
var modelFiles = flag.String("m", "osnet_x1_0_market_256x128-rk3588-batch{1,4,8,16}.rknn",
"RKNN compiled model files in format <name>-batch{N1,N2,...,Nk}.rknn")
var rkPlatform = flag.String("p", "rk3588",
"Rockchip CPU Model number [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588]")
// expandModelPattern takes a pattern like
//
// "/some/dir/osnet_x1_0_market_256x128-rk3588-batch{1,4,8,16}.rknn"
//
// and returns:
//
// []string{
// "/some/dir/osnet_x1_0_market_256x128-rk3588-batch1.rknn",
// "/some/dir/osnet_x1_0_market_256x128-rk3588-batch4.rknn",
// "/some/dir/osnet_x1_0_market_256x128-rk3588-batch8.rknn",
// "/some/dir/osnet_x1_0_market_256x128-rk3588-batch16.rknn",
// }
func expandModelPattern(pattern string) ([]modelBatches, error) {
// split off the directory and file
dir, file := filepath.Split(pattern)
// match exactly "<prefix>-batch{n1,n2,...}.rknn"
re := regexp.MustCompile(`^(.+)-batch\{([\d,]+)\}\.rknn$`)
m := re.FindStringSubmatch(file)
if m == nil {
return nil, errors.New("invalid pattern: must be name-batch{n1,n2,...}.rknn")
}
prefix := m[1] // e.g. "osnet_x1_0_market_256x128-rk3588"
numsCSV := m[2] // e.g. "1,4,8,16"
nums := strings.Split(numsCSV, ",")
out := make([]modelBatches, 0, len(nums))
for _, strNum := range nums {
num, err := strconv.Atoi(strNum)
if err != nil {
return nil, fmt.Errorf("invalid batch size %q: %w", strNum, err)
}
name := fmt.Sprintf("%s-batch%d.rknn", prefix, num)
out = append(out, modelBatches{
batchSize: num,
modelFile: filepath.Join(dir, name),
})
}
return out, nil
}
type modelBatches struct {
batchSize int
modelFile string
}
// BenchmarkBatchSize runs benchmarks against multiple models to work out per
// image inference time.
func BenchmarkBatchSize(b *testing.B) {
flag.Parse()
// from the modelFiles argument create a table of model files and corresponding
// batch sizes
cases, err := expandModelPattern(*modelFiles)
if err != nil {
b.Fatalf("Invalid modelFile syntax: %v", err)
}
const (
height = 256
width = 128
channels = 3
)
for _, tc := range cases {
tc := tc // capture
b.Run(fmt.Sprintf("Batch%02d", tc.batchSize), func(b *testing.B) {
// load the RKNN model for this batch size
err := SetCPUAffinityByPlatform(*rkPlatform, FastCores)
if err != nil {
b.Fatalf("Failed to set CPU Affinity: %v", err)
}
// if a platform other than the default rk3588 was specified, swap the
// platform suffix in the model filename
modelFile := tc.modelFile
if *rkPlatform != "rk3588" {
modelFile = strings.ReplaceAll(modelFile, "rk3588", *rkPlatform)
}
// create rknn runtime instance
rt, err := NewRuntimeByPlatform(*rkPlatform, modelFile)
if err != nil {
b.Fatalf("Error initializing RKNN runtime: %v", err)
}
defer rt.Close()
// set runtime to leave output tensors as int8
rt.SetWantFloat(false)
// prepare zero images
imgs := make([]gocv.Mat, tc.batchSize)
for i := range imgs {
m := gocv.Zeros(height, width, gocv.MatTypeCV8UC3)
defer m.Close()
imgs[i] = m
}
// pre-allocate the batch container
batch := NewBatch(tc.batchSize, height, width, channels, rt.inputTypeFloat32)
defer batch.Close()
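// reset the timer so setup cost is excluded; each iteration below measures
// filling the batch plus a single inference call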
b.ResetTimer()
var totalInf time.Duration
for i := 0; i < b.N; i++ {
batch.Clear()
start := time.Now()
for _, img := range imgs {
if err := batch.Add(img); err != nil {
b.Fatalf("Add() error: %v", err)
}
}
if _, err := rt.Inference([]gocv.Mat{batch.Mat()}); err != nil {
b.Fatalf("Inference() error: %v", err)
}
totalInf += time.Since(start)
}
b.StopTimer()
// milliseconds per batch
msBatch := float64(totalInf.Nanoseconds()) / 1e6 / float64(b.N)
b.ReportMetric(msBatch, "ms/batch")
// milliseconds per image
msImg := msBatch / float64(tc.batchSize)
b.ReportMetric(msImg, "ms/img")
})
}
}
func TestBatchAddAndOverflow(t *testing.T) {
r := &Runtime{inputTypeFloat32: false}
batch := NewBatch(2, 2, 3, 1, r.inputTypeFloat32)
defer batch.Close()
// create Mats with known data
m1 := gocv.NewMatWithSize(2, 3, gocv.MatTypeCV8U)
defer m1.Close()
buf1, _ := m1.DataPtrUint8()
for i := range buf1 {
buf1[i] = uint8(i + 1) // 1,2,3...6
}
m2 := gocv.NewMatWithSize(2, 3, gocv.MatTypeCV8U)
defer m2.Close()
buf2, _ := m2.DataPtrUint8()
for i := range buf2 {
buf2[i] = uint8((i + 1) * 10) // 10,20,...60
}
// Add two images
if err := batch.Add(m1); err != nil {
t.Fatalf("Add(m1) failed: %v", err)
}
if err := batch.Add(m2); err != nil {
t.Fatalf("Add(m2) failed: %v", err)
}
// Underlying batch mat should contain both
bMat := batch.Mat()
allData, err := bMat.DataPtrUint8()
if err != nil {
t.Fatalf("DataPtrUint8 on batch failed: %v", err)
}
// first 6 from buf1, next 6 from buf2
for i := 0; i < 6; i++ {
if allData[i] != buf1[i] {
t.Errorf("element %d = %d; want %d from img1", i, allData[i], buf1[i])
}
}
for i := 0; i < 6; i++ {
if allData[6+i] != buf2[i] {
t.Errorf("element %d = %d; want %d from img2", 6+i, allData[6+i], buf2[i])
}
}
// third Add should overflow
m3 := gocv.NewMatWithSize(2, 3, gocv.MatTypeCV8U)
defer m3.Close()
err3 := batch.Add(m3)
if err3 == nil {
t.Fatal("expected overflow error on third Add, got nil")
}
}
func TestBatchAddAtAndClear(t *testing.T) {
r := &Runtime{inputTypeFloat32: false}
batch := NewBatch(3, 2, 2, 1, r.inputTypeFloat32)
defer batch.Close()
m := gocv.NewMatWithSize(2, 2, gocv.MatTypeCV8U)
defer m.Close()
dat, _ := m.DataPtrUint8()
for i := range dat {
dat[i] = uint8(i + 5)
}
// AddAt index 1
if err := batch.AddAt(1, m); err != nil {
t.Fatalf("AddAt failed: %v", err)
}
// matCnt should still be zero
if batch.matCnt != 0 {
t.Errorf("matCnt = %d; want 0 after AddAt", batch.matCnt)
}
// Clear resets matCnt
batch.Clear()
if batch.matCnt != 0 {
t.Errorf("matCnt = %d; want 0 after Clear", batch.matCnt)
}
// Add at invalid index
err := batch.AddAt(5, m)
if err == nil {
t.Error("expected error for AddAt out of range, got nil")
}
}
func TestGetOutputIntAndF32(t *testing.T) {
r := &Runtime{inputTypeFloat32: false}
batch := NewBatch(2, 2, 2, 1, r.inputTypeFloat32)
defer batch.Close()
// Test GetOutputInt bounds
dOut := Output{BufInt: []int8{1, 2, 3, 4}, Size: 4}
if _, err := batch.GetOutputInt(-1, dOut, 2); err == nil {
t.Error("expected error for GetOutputInt idx<0")
}
if _, err := batch.GetOutputInt(2, dOut, 2); err == nil {
t.Error("expected error for GetOutputInt idx>=size")
}
// valid slice
slice, err := batch.GetOutputInt(1, dOut, 2)
if err != nil {
t.Errorf("GetOutputInt failed: %v", err)
}
if len(slice) != 2 {
t.Errorf("len(slice) = %d; want 2", len(slice))
}
// Test GetOutputF32 bounds
dOutF := Output{BufFloat: []float32{1, 2, 3, 4}, Size: 4}
if _, err := batch.GetOutputF32(-1, dOutF, 2); err == nil {
t.Error("expected error for GetOutputF32 idx<0")
}
if _, err := batch.GetOutputF32(2, dOutF, 2); err == nil {
t.Error("expected error for GetOutputF32 idx>=size")
}
sliceF, err := batch.GetOutputF32(0, dOutF, 2)
if err != nil {
t.Errorf("GetOutputF32 failed: %v", err)
}
if len(sliceF) != 2 {
t.Errorf("len(sliceF) = %d; want 2", len(sliceF))
}
}

batchpool.go (new file, 75 lines)

@@ -0,0 +1,75 @@
package rknnlite
import (
"sync"
)
// BatchPool is a pool of batches
type BatchPool struct {
// pool of batches
batches chan *Batch
// size of pool
size int
close sync.Once
}
// NewBatchPool returns a pool of Batches
func NewBatchPool(size int, rt *Runtime) *BatchPool {
p := &BatchPool{
batches: make(chan *Batch, size),
size: size,
}
batchSize := int(rt.InputAttrs()[0].Dims[0])
width := int(rt.InputAttrs()[0].Dims[1])
height := int(rt.InputAttrs()[0].Dims[2])
channels := int(rt.InputAttrs()[0].Dims[3])
inputType := rt.GetInputTypeFloat32()
// create batch pool to be the same size as the runtime pool
for i := 0; i < size; i++ {
batch := NewBatch(
batchSize,
height,
width,
channels,
inputType,
)
// attach to pool
p.Return(batch)
}
return p
}
// Get returns a batch from the pool, blocking until one is available
func (p *BatchPool) Get() *Batch {
return <-p.batches
}
// Return a batch to the pool
func (p *BatchPool) Return(batch *Batch) {
batch.Clear()
select {
case p.batches <- batch:
default:
// pool is full, discard the batch
}
}
// Close the pool and all batches in it
func (p *BatchPool) Close() {
p.close.Do(func() {
// close channel
close(p.batches)
// drain and close all batches
for next := range p.batches {
_ = next.Close()
}
})
}

example/batch/README.md (new file, 230 lines)

@@ -0,0 +1,230 @@
# Batch Models
## Overview
Typically computer vision inference models have a single input tensor in
the shape of `NHWC`, such as `[1,224,224,3]`. The rknn-toolkit2 allows you to
build the model with batched tensor inputs by setting the `rknn_batch_size` parameter
in your python conversion script.
```
rknn.build(do_quantization=do_quant, dataset=DATASET_PATH, rknn_batch_size=8)
```
This results in a .rknn model with modified input tensor dimensions of `[8,224,224,3]`.
When taking input from a video source frame by frame, batching has little use,
as you're only dealing with a single frame that needs to be processed as soon
as possible. However, batching can be useful if you have many images to
process at a single point in time, for example:
* Running YOLO object detection on a frame, then passing all detected objects
through a Re-Identification model in batches.
* Buffering video frames and, upon an external signal, triggering the processing
of those buffered frames as a batch.
## Batch Sizing
The NPUs in the RK356x, RK3576, and RK3588 platforms have different
amounts of SRAM and different NPU core counts, so finding the optimal batch
size for your Model is critical.
A benchmarking tool has been created to test different batch sizes of your own
RKNN Models. Use your python conversion script to compile the ONNX model to RKNN
with the various `rknn_batch_size` values you would like to test, and name those
RKNN Models using the format `<name>-batch{N1,N2,...,Nk}.rknn`. For example, to
test batch sizes of 1, 4, 8, and 16 of an OSNet model, create the
following files and place them in the directory `/tmp/models` on the host OS.
```
osnet-batch1.rknn
osnet-batch4.rknn
osnet-batch8.rknn
osnet-batch16.rknn
```
We can then pass all these Models to the benchmark using the `-m` argument in
the format `-m "/tmp/models/osnet-batch{1,4,8,16}.rknn"`.
To run the benchmark of your models on the rk3588, or substitute your
Platform model:
```
# from project root directory
go test -bench=BenchmarkBatchSize -benchtime=10s \
-args -p rk3588 -m "/tmp/models/osnet-batch{1,4,8,16}.rknn"
```
Similarly, using Docker, we can mount the `/tmp/models` directory and run:
```
# from project root directory
docker run --rm \
--device /dev/dri:/dev/dri \
-v "$(pwd):/go/src/app" \
-v "$(pwd)/example/data:/go/src/data" \
-v "/usr/include/rknn_api.h:/usr/include/rknn_api.h" \
-v "/usr/lib/librknnrt.so:/usr/lib/librknnrt.so" \
-v "/tmp/models/:/tmp/models/" \
-w /go/src/app \
swdee/go-rknnlite:latest \
go test -bench=BenchmarkBatchSize -benchtime=10s \
-args -p rk3588 -m "/tmp/models/osnet-batch{1,4,8,16}.rknn"
```
Running the above benchmark command outputs the following results.
#### rk3588
```
BenchmarkBatchSize/Batch01-8 1897 8806025 ns/op 8.806 ms/batch 8.806 ms/img
BenchmarkBatchSize/Batch04-8 885 21555109 ns/op 21.55 ms/batch 5.389 ms/img
BenchmarkBatchSize/Batch08-8 534 22335645 ns/op 22.34 ms/batch 2.792 ms/img
BenchmarkBatchSize/Batch16-8 303 40253162 ns/op 40.25 ms/batch 2.516 ms/img
```
#### rk3576
```
BenchmarkBatchSize/Batch01-8 1312 8987117 ns/op 8.985 ms/batch 8.985 ms/img
BenchmarkBatchSize/Batch04-8 640 18836090 ns/op 18.83 ms/batch 4.709 ms/img
BenchmarkBatchSize/Batch08-8 385 31702649 ns/op 31.70 ms/batch 3.963 ms/img
BenchmarkBatchSize/Batch16-8 194 63801596 ns/op 63.80 ms/batch 3.988 ms/img
```
#### rk3566
```
BenchmarkBatchSize/Batch01-4 661 18658568 ns/op 18.66 ms/batch 18.66 ms/img
BenchmarkBatchSize/Batch04-4 158 74716574 ns/op 74.71 ms/batch 18.68 ms/img
BenchmarkBatchSize/Batch08-4 70 155374027 ns/op 155.4 ms/batch 19.42 ms/img
BenchmarkBatchSize/Batch16-4 37 294969497 ns/op 295.0 ms/batch 18.44 ms/img
```
### Interpreting Benchmark Results
The `ms/batch` metric represents the number of milliseconds it took for the
whole batch inference to run and `ms/img` represents the average number of
milliseconds it took to run inference per image.
As can be seen in the rk3588 results, the ideal batch size is 8, as it gives
a low `2.792` ms/img inference time versus a total batch inference time of
`22.34ms`. The same applies to the rk3576.
The rk3566 has a single core NPU, and the results show there is no benefit
in running batching at all.
These results were for an OSNet Model; different Models may perform
differently, so you should run these benchmarks for your own application to
optimize accordingly.
## Usage
An example batch program is provided that combines inferencing with a Pool of
runtimes. Make sure you have downloaded the data files first for the examples.
You only need to do this once for all examples.
```
cd example/
git clone --depth=1 https://github.com/swdee/go-rknnlite-data.git data
```
Run the batch example on the rk3588, or substitute your Platform model.
```
cd example/batch
go run batch.go -s 3 -p rk3588
```
This will result in the output of:
```
Driver Version: 0.9.6, API Version: 2.3.0 (c949ad889d@2024-11-07T11:35:33)
Model Input Number: 1, Ouput Number: 1
Input tensors:
index=0, name=input, n_dims=4, dims=[8, 224, 224, 3], n_elems=1204224, size=1204224, fmt=NHWC, type=INT8, qnt_type=AFFINE, zp=-14, scale=0.018658
Output tensors:
index=0, name=output, n_dims=2, dims=[8, 1000, 0, 0], n_elems=8000, size=8000, fmt=UNDEFINED, type=INT8, qnt_type=AFFINE, zp=-55, scale=0.141923
Running...
File ../data/imagenet/n01514859_hen.JPEG, inference time 40ms
File ../data/imagenet/n01518878_ostrich.JPEG, inference time 40ms
File ../data/imagenet/n01530575_brambling.JPEG, inference time 40ms
File ../data/imagenet/n01531178_goldfinch.JPEG, inference time 40ms
...snip...
File ../data/imagenet/n13054560_bolete.JPEG, inference time 8ms
File ../data/imagenet/n13133613_ear.JPEG, inference time 8ms
File ../data/imagenet/n15075141_toilet_tissue.JPEG, inference time 8ms
Processed 1000 images in 2.098619346s, average inference per image is 2.10ms
```
See the help for command line parameters.
```
$ go run batch.go -h
Usage of /tmp/go-build1506342544/b001/exe/batch:
-d string
A directory of images to run inference on (default "../data/imagenet/")
-m string
RKNN compiled model file (default "../data/models/rk3588/mobilenetv2-batch8-rk3588.rknn")
-p string
Rockchip CPU Model number [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588] (default "rk3588")
-q Run in quiet mode, don't display individual inference results
-r int
Repeat processing image directory the specified number of times, use this if you don't have enough images (default 1)
-s int
Size of RKNN runtime pool, choose 1, 2, 3, or multiples of 3 (default 1)
```
### Docker
To run the batch example using the prebuilt docker image, make sure the data files have been downloaded first,
then run.
```
# from project root directory
docker run --rm \
--device /dev/dri:/dev/dri \
-v "$(pwd):/go/src/app" \
-v "$(pwd)/example/data:/go/src/data" \
-v "/usr/include/rknn_api.h:/usr/include/rknn_api.h" \
-v "/usr/lib/librknnrt.so:/usr/lib/librknnrt.so" \
-w /go/src/app \
swdee/go-rknnlite:latest \
go run ./example/batch/batch.go -p rk3588 -s 3
```
## API
A convenience function `rknnlite.NewBatch()` is provided to concatenate individual
images into a single input tensor for the Model and then extract their results
from the combined outputs.
```
// create a new batch processor
batch := rknnlite.NewBatch(batchSize, height, width, channels, rt.GetInputTypeFloat32())
defer batch.Close()
for idx, img := range imgs {
// add images to the batch at the given index
batch.AddAt(idx, img)
// OR you can add images incrementally without specifying an index
batch.Add(img)
}
// pass the concatenated Mat to the runtime for inference
outputs, err := rt.Inference([]gocv.Mat{batch.Mat()})
// then get a single image result by index
output, err := batch.GetOutputInt(4, outputs.Output[0], int(outputs.OutputAttributes().DimForDFL))
```
See the full example code for more details.
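This commit also adds a `BatchPool` for reusing pre-allocated batches across
goroutines when combined with a runtime Pool. A minimal sketch, assuming a
batch-compiled model file (pool size and model name are illustrative):
```
// create a runtime pool and a matching pool of pre-allocated batches
pool, err := rknnlite.NewPool(3, "model-batch8.rknn",
    []rknnlite.CoreMask{rknnlite.NPUCoreAuto})
if err != nil {
    log.Fatal(err)
}
defer pool.Close()

rt := pool.Get()
batchPool := rknnlite.NewBatchPool(pool.Size(), rt)
defer batchPool.Close()
pool.Return(rt)

// in each worker goroutine
rt = pool.Get()
batch := batchPool.Get()
// ...Add() images and run rt.Inference([]gocv.Mat{batch.Mat()})...
batchPool.Return(batch) // Return() clears the batch for reuse
pool.Return(rt)
```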

example/batch/batch.go (new file, 222 lines)

@@ -0,0 +1,222 @@
package main
import (
"flag"
"github.com/swdee/go-rknnlite"
"gocv.io/x/gocv"
"image"
"log"
"os"
"path/filepath"
"strings"
"sync"
"time"
)
var (
// model input tensor dimensions, these values will be set
// when runtime queries the modelFile being loaded
height, width, channels, batchSize int
)
func main() {
// disable logging timestamps
log.SetFlags(0)
// read in cli flags
modelFile := flag.String("m", "../data/models/rk3588/mobilenetv2-batch8-rk3588.rknn", "RKNN compiled model file")
imgDir := flag.String("d", "../data/imagenet/", "A directory of images to run inference on")
poolSize := flag.Int("s", 1, "Size of RKNN runtime pool, choose 1, 2, 3, or multiples of 3")
repeat := flag.Int("r", 1, "Repeat processing image directory the specified number of times, use this if you don't have enough images")
quiet := flag.Bool("q", false, "Run in quiet mode, don't display individual inference results")
rkPlatform := flag.String("p", "rk3588", "Rockchip CPU Model number [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588]")
flag.Parse()
// set cpu affinity to run on specific CPU cores
err := rknnlite.SetCPUAffinityByPlatform(*rkPlatform, rknnlite.FastCores)
if err != nil {
log.Printf("Failed to set CPU Affinity: %v\n", err)
}
// check dir exists
info, err := os.Stat(*imgDir)
if err != nil {
log.Fatalf("No such image directory %s, error: %v\n", *imgDir, err)
}
if !info.IsDir() {
log.Fatal("Image path is not a directory")
}
// check if user specified model file or if default is being used. if default
// then pick the default platform model to use.
if f := flag.Lookup("m"); f != nil && f.Value.String() == f.DefValue && *rkPlatform != "rk3588" {
*modelFile = strings.ReplaceAll(*modelFile, "rk3588", *rkPlatform)
}
// create new pool, we pass NPUCoreAuto as RKNN does not allow batch Models
// to be pinned to specific NPU cores
useCore := rknnlite.NPUCoreAuto
if strings.HasPrefix(strings.ToLower(*rkPlatform), "rk356") {
useCore = rknnlite.NPUSkipSetCore
}
pool, err := rknnlite.NewPool(*poolSize, *modelFile,
[]rknnlite.CoreMask{useCore})
if err != nil {
log.Fatalf("Error creating RKNN pool: %v\n", err)
}
// set runtime to leave output tensors as int8
pool.SetWantFloat(false)
// get a runtime and query the input tensor dimensions of the model
rt := pool.Get()
// optional querying of model file tensors and SDK version for printing
// to stdout. not necessary for production inference code
err = rt.Query(os.Stdout)
if err != nil {
log.Fatal("Error querying runtime: ", err)
}
batchSize = int(rt.InputAttrs()[0].Dims[0])
width = int(rt.InputAttrs()[0].Dims[1])
height = int(rt.InputAttrs()[0].Dims[2])
channels = int(rt.InputAttrs()[0].Dims[3])
pool.Return(rt)
// get list of all files in the directory
entries, err := os.ReadDir(*imgDir)
if err != nil {
log.Fatalf("Error reading image directory: %v\n", err)
}
var files []string
for _, e := range entries {
if e.IsDir() {
continue
}
files = append(files, filepath.Join(*imgDir, e.Name()))
}
log.Println("Running...")
// waitgroup used to wait for all go-routines to complete before closing
// the pool
var wg sync.WaitGroup
start := time.Now()
// repeat processing image set the specified number of times
for i := 0; i < *repeat; i++ {
// process image files in groups of batchSize
for offset := 0; offset < len(files); offset += batchSize {
end := offset + batchSize
if end > len(files) {
end = len(files)
}
subset := files[offset:end]
// pool.Get() blocks if no runtimes are available in the pool
rt := pool.Get()
wg.Add(1)
go func(rt *rknnlite.Runtime, batchPaths []string) {
defer wg.Done()
processBatch(rt, batchPaths, *quiet)
pool.Return(rt)
}(rt, subset)
}
}
wg.Wait()
// calculate average inference
numFiles := *repeat * len(files)
end := time.Since(start)
avg := (end.Seconds() / float64(numFiles)) * 1000
log.Printf("Processed %d images in %s, average inference per image is %.2fms\n",
numFiles, end.String(), avg)
pool.Close()
}
func processBatch(rt *rknnlite.Runtime, paths []string, quiet bool) {
// create batch
batch := rknnlite.NewBatch(batchSize, height, width, channels,
rt.GetInputTypeFloat32())
defer batch.Close()
// for each image path, load & preprocess, then Add to batch
for idx, file := range paths {
img := gocv.IMRead(file, gocv.IMReadColor)
if img.Empty() {
log.Printf("Error reading %s\n", file)
continue
}
defer img.Close()
// rgb + resize
rgbImg := gocv.NewMat()
gocv.CvtColor(img, &rgbImg, gocv.ColorBGRToRGB)
defer rgbImg.Close()
cropImg := gocv.NewMat()
gocv.Resize(rgbImg, &cropImg, image.Pt(width, height), 0, 0, gocv.InterpolationArea)
defer cropImg.Close()
if err := batch.AddAt(idx, cropImg); err != nil {
log.Printf("Batch.Add error: %v\n", err)
}
}
// run inference on the entire batch at once
start := time.Now()
outputs, err := rt.Inference([]gocv.Mat{batch.Mat()})
spent := time.Since(start)
if err != nil {
log.Printf("Inference error: %v\n", err)
return
}
defer outputs.Free()
// unpack per image results
for idx := 0; idx < len(paths); idx++ {
if quiet {
continue
}
// get int8 output tensor for image at idx
_, err := batch.GetOutputInt(idx, outputs.Output[0], int(outputs.OutputAttributes().DimForDFL))
if err != nil {
log.Printf("GetOutputInt[%d] error: %v\n", idx, err)
continue
}
log.Printf("File %s, inference time %dms\n", paths[idx], spent.Milliseconds())
}
}

example/reid/README.md (new file, 188 lines)

@@ -0,0 +1,188 @@
# Re-Identification (ReID)
## Overview
Object trackers like ByteTrack can be used to track visible objects frame to frame,
but they rely on the assumption that an object's appearance and location change
smoothly over time. If a person goes behind a building or is briefly hidden
by another passerby, the tracker can lose that object's identity. When that same
person re-emerges, the tracker often treats them as a new object, assigning a new ID.
This makes analyzing a person's complete path through a scene difficult
and makes counting unique objects much harder.
Re-Identification (ReID) models help solve this problem by using embedding features
which encode an object into a fixed-length vector that captures distinctive
patterns, shapes, or other visual signatures. When an object disappears and
then reappears, you can compare the newly detected object's embedding against a list of
past objects. If the distance (using Cosine or Euclidean distance) is within
a chosen threshold, you can confidently link the new detection back to the
original track ID.
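In code the comparison reduces to a distance check between embedding vectors.
A minimal sketch using this library's `postprocess/reid` helpers (the `0.51`
threshold is illustrative and should be calibrated per dataset):
```
// fpNew and fpOld are L2-normalized embedding vectors ([]float32)
// extracted from the model outputs
dist := reid.EuclideanDistance(fpNew, fpOld)

if dist < 0.51 {
    // treat the new detection as the same object and
    // re-link it to the original track ID
}
```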
## Datasets
The [OSNet model](https://paperswithcode.com/paper/omni-scale-feature-learning-for-person-re) is
lightweight and provides good accuracy for re-identification tasks; however,
it must be trained on a dataset for the specific object classes to be identified.
This example uses the [Market1501](https://paperswithcode.com/dataset/market-1501)
dataset trained for re-identifying people.
To support other object classifications such as Vehicles, Faces, or Animals, you
will need to source datasets and train models accordingly.
## Occlusion Example
In the [people walking video](https://github.com/swdee/go-rknnlite-data/raw/master/people-walking.mp4)
a lady wearing a CK-branded jacket appears
at the beginning of the scene and becomes occluded by passersby. When she
reappears, ByteTrack detects her as a new person.
![CK Lady](https://github.com/swdee/go-rknnlite-data/raw/master/docimg/reid-ck-lady-movement.jpg)
## Usage
Make sure you have downloaded the data files first for the examples.
You only need to do this once for all examples.
```
cd example/
git clone --depth=1 https://github.com/swdee/go-rknnlite-data.git data
```
Command line Usage.
```
$ go run reid.go -h
Usage of /tmp/go-build147978858/b001/exe/reid:
-d string
Data file containing object co-ordinates (default "../data/reid-objects.dat")
-e float
The Euclidean distance threshold [0.0-1.0], a value less than this defines a match (default 0.51)
-i string
Image file to run inference on (default "../data/reid-walking.jpg")
-m string
RKNN compiled model file (default "../data/models/rk3588/osnet-market1501-batch8-rk3588.rknn")
-p string
Rockchip CPU Model number [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588] (default "rk3588")
```
Run the ReID example on the rk3588, or substitute your Platform model.
```
cd example/reid/
go run reid.go -p rk3588
```
This will result in the output of:
```
Driver Version: 0.9.6, API Version: 2.3.0 (c949ad889d@2024-11-07T11:35:33)
Model Input Number: 1, Ouput Number: 1
Input tensors:
index=0, name=input, n_dims=4, dims=[8, 256, 128, 3], n_elems=786432, size=786432, fmt=NHWC, type=INT8, qnt_type=AFFINE, zp=-14, scale=0.018658
Output tensors:
index=0, name=output, n_dims=2, dims=[8, 512, 0, 0], n_elems=4096, size=4096, fmt=UNDEFINED, type=INT8, qnt_type=AFFINE, zp=-128, scale=0.018782
Comparing object 0 at (0,0,134,361)
Object 0 at (0,0,134,361) has euclidean distance: 0.000000 (same person)
Object 1 at (134,0,251,325) has euclidean distance: 0.423271 (same person)
Object 2 at (251,0,326,208) has euclidean distance: 0.465061 (same person)
Object 3 at (326,0,394,187) has euclidean distance: 0.445583 (same person)
Comparing object 1 at (394,0,513,357)
Object 0 at (0,0,134,361) has euclidean distance: 0.781510 (different person)
Object 1 at (134,0,251,325) has euclidean distance: 0.801649 (different person)
Object 2 at (251,0,326,208) has euclidean distance: 0.680299 (different person)
Object 3 at (326,0,394,187) has euclidean distance: 0.686542 (different person)
Comparing object 2 at (513,0,588,246)
Object 0 at (0,0,134,361) has euclidean distance: 0.860921 (different person)
Object 1 at (134,0,251,325) has euclidean distance: 0.873663 (different person)
Object 2 at (251,0,326,208) has euclidean distance: 0.870753 (different person)
Object 3 at (326,0,394,187) has euclidean distance: 0.820761 (different person)
Comparing object 3 at (588,0,728,360)
Object 0 at (0,0,134,361) has euclidean distance: 0.762738 (different person)
Object 1 at (134,0,251,325) has euclidean distance: 0.800668 (different person)
Object 2 at (251,0,326,208) has euclidean distance: 0.763694 (different person)
Object 3 at (326,0,394,187) has euclidean distance: 0.769597 (different person)
Model first run speed: batch preparation=3.900093ms, inference=47.935686ms, post processing=262.203µs, total time=52.097982ms
done
```
### Docker
To run the ReID example using the prebuilt docker image, make sure the data files have been downloaded first,
then run.
```
# from project root directory
docker run --rm \
--device /dev/dri:/dev/dri \
-v "$(pwd):/go/src/app" \
-v "$(pwd)/example/data:/go/src/data" \
-v "/usr/include/rknn_api.h:/usr/include/rknn_api.h" \
-v "/usr/lib/librknnrt.so:/usr/lib/librknnrt.so" \
-w /go/src/app \
swdee/go-rknnlite:latest \
go run ./example/reid/reid.go -p rk3588
```
### Interpreting Results
The above example uses people detected with a YOLOv5 model and then cropped
from the scene to create the sample input.
![CK Lady](https://github.com/swdee/go-rknnlite-data/raw/master/reid-walking.jpg)
Objects A1 to A4 represent the same person and objects B1, C1, and D1 are other
people from the same scene.
The first set of comparisons:
```
Comparing object 0 [A1] at (0,0,134,361)
Object 0 [A1] at (0,0,134,361) has euclidean distance: 0.000000 (same person)
Object 1 [A2] at (134,0,251,325) has euclidean distance: 0.423271 (same person)
Object 2 [A3] at (251,0,326,208) has euclidean distance: 0.465061 (same person)
Object 3 [A4] at (326,0,394,187) has euclidean distance: 0.445583 (same person)
```
Object 0 is A1; when compared to itself it has a euclidean distance of 0.0.
Objects 1-3 are A2 to A4; each of these has a similar
distance, ranging from 0.42 to 0.46.
For L2-normalized embeddings the euclidean distance ranges from 0.0 (identical)
to 2.0 (opposite), so the lower the distance the more similar the object is. A
threshold of `0.51` is used to define the maximum distance at which an object
is considered the same. Your use case and datasets may require calibration of
the ideal threshold.
The remaining results compare the people B1, C1, and D1.
```
Comparing object 1 [B1] at (394,0,513,357)
Object 0 [A1] at (0,0,134,361) has euclidean distance: 0.781510 (different person)
Object 1 [A2] at (134,0,251,325) has euclidean distance: 0.801649 (different person)
Object 2 [A3] at (251,0,326,208) has euclidean distance: 0.680299 (different person)
Object 3 [A4] at (326,0,394,187) has euclidean distance: 0.686542 (different person)
Comparing object 2 [C1] at (513,0,588,246)
Object 0 [A1] at (0,0,134,361) has euclidean distance: 0.860921 (different person)
Object 1 [A2] at (134,0,251,325) has euclidean distance: 0.873663 (different person)
Object 2 [A3] at (251,0,326,208) has euclidean distance: 0.870753 (different person)
Object 3 [A4] at (326,0,394,187) has euclidean distance: 0.820761 (different person)
Comparing object 3 [D1] at (588,0,728,360)
Object 0 [A1] at (0,0,134,361) has euclidean distance: 0.762738 (different person)
Object 1 [A2] at (134,0,251,325) has euclidean distance: 0.800668 (different person)
Object 2 [A3] at (251,0,326,208) has euclidean distance: 0.763694 (different person)
Object 3 [A4] at (326,0,394,187) has euclidean distance: 0.769597 (different person)
```
All of these other people have a euclidean distance greater than 0.68, indicating
they are different people.
## Postprocessing
[Convenience functions](https://github.com/swdee/go-rknnlite-data/raw/master/postprocess/reid.go)
are provided for calculating the Euclidean Distance or Cosine Similarity,
depending on how the Model has been trained.
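For example, with int8 model outputs the embedding is dequantized and
L2-normalized before comparison. A minimal sketch, where `outA`/`outB` come from
`batch.GetOutputInt()` and the scale and zero-point from
`outputs.OutputAttributes()`:
```
fpA := reid.DequantizeAndL2Normalize(outA, scale, zeroPoint)
fpB := reid.DequantizeAndL2Normalize(outB, scale, zeroPoint)

// pick the metric the Model was trained for
euDist := reid.EuclideanDistance(fpA, fpB)
cosDist := reid.CosineDistance(fpA, fpB)
```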

example/reid/reid.go (new file, 524 lines)

@@ -0,0 +1,524 @@
package main
import (
"bufio"
"flag"
"fmt"
"github.com/swdee/go-rknnlite"
"github.com/swdee/go-rknnlite/postprocess/reid"
"gocv.io/x/gocv"
"image"
"log"
"os"
"strconv"
"strings"
"time"
)
func main() {
// disable logging timestamps
log.SetFlags(0)
// read in cli flags
modelFile := flag.String("m", "../data/models/rk3588/osnet-market1501-batch8-rk3588.rknn", "RKNN compiled model file")
imgFile := flag.String("i", "../data/reid-walking.jpg", "Image file to run inference on")
objsFile := flag.String("d", "../data/reid-objects.dat", "Data file containing object co-ordinates")
rkPlatform := flag.String("p", "rk3588", "Rockchip CPU Model number [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588]")
euDist := flag.Float64("e", 0.51, "The Euclidean distance threshold [0.0-1.0], a value less than this defines a match")
flag.Parse()
err := rknnlite.SetCPUAffinityByPlatform(*rkPlatform, rknnlite.FastCores)
if err != nil {
log.Printf("Failed to set CPU Affinity: %v", err)
}
// check if user specified model file or if default is being used. if default
// then pick the default platform model to use.
if f := flag.Lookup("m"); f != nil && f.Value.String() == f.DefValue && *rkPlatform != "rk3588" {
*modelFile = strings.ReplaceAll(*modelFile, "rk3588", *rkPlatform)
}
// create rknn runtime instance
rt, err := rknnlite.NewRuntimeByPlatform(*rkPlatform, *modelFile)
if err != nil {
log.Fatal("Error initializing RKNN runtime: ", err)
}
// set runtime to leave output tensors as int8
rt.SetWantFloat(false)
// optional querying of model file tensors and SDK version for printing
// to stdout. not necessary for production inference code
err = rt.Query(os.Stdout)
if err != nil {
log.Fatal("Error querying runtime: ", err)
}
// load objects file
objs, err := ParseObjects(*objsFile)
if err != nil {
log.Fatal("Error parsing objects: ", err)
}
// load image
img := gocv.IMRead(*imgFile, gocv.IMReadColor)
if img.Empty() {
log.Fatal("Error reading image from: ", *imgFile)
}
// convert colorspace
srcImg := gocv.NewMat()
gocv.CvtColor(img, &srcImg, gocv.ColorBGRToRGB)
defer img.Close()
defer srcImg.Close()
start := time.Now()
// create a batch to process all of the compare and dataset images
// in a single forward pass
batch := rknnlite.NewBatch(
int(rt.InputAttrs()[0].Dims[0]),
int(rt.InputAttrs()[0].Dims[2]),
int(rt.InputAttrs()[0].Dims[1]),
int(rt.InputAttrs()[0].Dims[3]),
rt.GetInputTypeFloat32(),
)
// scale size is the input tensor dimensions to scale the object to
scaleSize := image.Pt(int(rt.InputAttrs()[0].Dims[1]), int(rt.InputAttrs()[0].Dims[2]))
// add the compare images to the batch
for _, cmpObj := range objs.Compare {
err := AddObjectToBatch(batch, srcImg, cmpObj, scaleSize)
if err != nil {
log.Fatal("Error creating batch: ", err)
}
}
// add the dataset images to the batch
for _, dtObj := range objs.Dataset {
err := AddObjectToBatch(batch, srcImg, dtObj, scaleSize)
if err != nil {
log.Fatal("Error creating batch: ", err)
}
}
defer batch.Close()
endBatch := time.Now()
// run inference on the batch
outputs, err := rt.Inference([]gocv.Mat{batch.Mat()})
endInference := time.Now()
if err != nil {
log.Fatal("Runtime inferencing failed with error: ", err)
}
// get total number of compare objects
totalCmp := len(objs.Compare)
// compare each object to those objects in the dataset for similarity
for i, cmpObj := range objs.Compare {
// get the compare object's output
cmpOutput, err := batch.GetOutputInt(i, outputs.Output[0], int(outputs.OutputAttributes().DimForDFL))
if err != nil {
log.Fatal("Getting output tensor failed with error: ", err)
}
log.Printf("Comparing object %d at (%d,%d,%d,%d)\n", i,
cmpObj.X1, cmpObj.Y1, cmpObj.X2, cmpObj.Y2)
for j, dtObj := range objs.Dataset {
// get each object's outputs
nextOutput, err := batch.GetOutputInt(totalCmp+j, outputs.Output[0], int(outputs.OutputAttributes().DimForDFL))
if err != nil {
log.Fatal("Getting output tensor failed with error: ", err)
}
dist := CompareObjects(
cmpOutput,
nextOutput,
outputs.OutputAttributes().Scales[0],
outputs.OutputAttributes().ZPs[0],
)
// check euclidean distance to determine if it is the same person or not
objRes := "different person"
if dist < float32(*euDist) {
objRes = "same person"
}
log.Printf(" Object %d at (%d,%d,%d,%d) has euclidean distance: %f (%s)\n",
j,
dtObj.X1, dtObj.Y1, dtObj.X2, dtObj.Y2,
dist, objRes)
}
}
endCompare := time.Now()
log.Printf("Model first run speed: batch preparation=%s, inference=%s, post processing=%s, total time=%s\n",
endBatch.Sub(start).String(),
endInference.Sub(endBatch).String(),
endCompare.Sub(endInference).String(),
endCompare.Sub(start).String(),
)
// free outputs allocated in C memory after you have finished post processing
err = outputs.Free()
if err != nil {
log.Fatal("Error freeing Outputs: ", err)
}
// close runtime and release resources
err = rt.Close()
if err != nil {
log.Fatal("Error closing RKNN runtime: ", err)
}
log.Println("done")
/*
//CompareObject(rt, srcImg, cmpObj, objs.Dataset)
//rgbImg := img.Clone()
frameWidth := 67
frameHeight := 177
roiRect1 := image.Rect(497, 195, 497+frameWidth, 195+frameHeight)
// cklady
//roiRect1 := image.Rect(0, 0, 134, 361)
roiImg1 := rgbImg.Region(roiRect1)
cropImg1 := rgbImg.Clone()
scaleSize1 := image.Pt(int(rt.InputAttrs()[0].Dims[1]), int(rt.InputAttrs()[0].Dims[2]))
gocv.Resize(roiImg1, &cropImg1, scaleSize1, 0, 0, gocv.InterpolationArea)
defer img.Close()
defer rgbImg.Close()
defer cropImg1.Close()
defer roiImg1.Close()
gocv.IMWrite("/tmp/frame-master.jpg", cropImg1)
batch := rt.NewBatch(
int(rt.InputAttrs()[0].Dims[0]),
int(rt.InputAttrs()[0].Dims[2]),
int(rt.InputAttrs()[0].Dims[1]),
int(rt.InputAttrs()[0].Dims[3]),
)
err = batch.Add(cropImg1)
if err != nil {
log.Fatal("Error creating batch: ", err)
}
defer batch.Close()
// perform inference on image file
outputs, err := rt.Inference([]gocv.Mat{batch.Mat()})
if err != nil {
log.Fatal("Runtime inferencing failed with error: ", err)
}
output, err := batch.GetOutputInt(0, outputs.Output[0], int(outputs.OutputAttributes().DimForDFL))
if err != nil {
log.Fatal("Getting output tensor failed with error: ", err)
}
fingerPrint := DequantizeAndL2Normalize(
output,
outputs.OutputAttributes().Scales[0],
outputs.OutputAttributes().ZPs[0],
)
// seed the EMA fingerprint to the master
emaFP := make([]float32, len(fingerPrint))
copy(emaFP, fingerPrint)
const alpha = 0.9 // smoothing factor
hash, err := FingerprintHash(fingerPrint)
if err != nil {
log.Fatalf("hashing failed: %v", err)
}
log.Println("object fingerprint:", hash)
// free outputs allocated in C memory after you have finished post processing
err = outputs.Free()
if err != nil {
log.Fatal("Error freeing Outputs: ", err)
}
// sample 2 images
yOffsets := []int{1, 195, 388}
xOffsets := []int{497, 565, 633, 701, 769, 836, 904}
images := [][]int{}
for _, ny := range yOffsets {
for _, nx := range xOffsets {
images = append(images, []int{nx, ny})
}
}
// ck lady
// images := [][]int{
// {134, 0, 117, 325},
// {251, 0, 75, 208},
// {326, 0, 68, 187},
// }
// Image 2
for frame, next := range images {
roiRect2 := image.Rect(next[0], next[1], next[0]+frameWidth, next[1]+frameHeight)
// ck lady
//roiRect2 := image.Rect(next[0], next[1], next[0]+next[2], next[1]+next[3])
roiImg2 := rgbImg.Region(roiRect2)
cropImg2 := rgbImg.Clone()
scaleSize2 := image.Pt(int(rt.InputAttrs()[0].Dims[1]), int(rt.InputAttrs()[0].Dims[2]))
gocv.Resize(roiImg2, &cropImg2, scaleSize2, 0, 0, gocv.InterpolationArea)
defer cropImg2.Close()
defer roiImg2.Close()
gocv.IMWrite(fmt.Sprintf("/tmp/frame-%d.jpg", frame), cropImg2)
start := time.Now()
batch.Clear()
err = batch.Add(cropImg2)
if err != nil {
log.Fatal("Error creating batch: ", err)
}
outputs, err = rt.Inference([]gocv.Mat{batch.Mat()})
if err != nil {
log.Fatal("Runtime inferencing failed with error: ", err)
}
endInference := time.Now()
output, err := batch.GetOutputInt(0, outputs.Output[0], int(outputs.OutputAttributes().DimForDFL))
if err != nil {
log.Fatal("Getting output tensor failed with error: ", err)
}
fingerPrint2 := DequantizeAndL2Normalize(
output,
outputs.OutputAttributes().Scales[0],
outputs.OutputAttributes().ZPs[0],
)
// sim := CosineSimilarity(fingerPrint, fingerPrint2)
// dist := CosineDistance(fingerPrint, fingerPrint2)
// fmt.Printf("Frame %d, cosine similarity: %f, distance=%f\n", frame, sim, dist)
// compute Euclidean (L2) distance directly
dist := EuclideanDistance(fingerPrint, fingerPrint2)
// 3) compute vs EMA
emaDist := EuclideanDistance(emaFP, fingerPrint2)
endDetect := time.Now()
objRes := "different person"
if emaDist < 0.51 {
objRes = "same person"
}
fmt.Printf("Frame %d, euclidean distance: %f, ema=%f (%s)\n", frame, dist, emaDist, objRes)
log.Printf(" Inference=%s, detect=%s, total time=%s\n",
endInference.Sub(start).String(),
endDetect.Sub(endInference).String(),
endDetect.Sub(start).String(),
)
// free outputs allocated in C memory after you have finished post processing
err = outputs.Free()
if err != nil {
log.Fatal("Error freeing Outputs: ", err)
}
// 4) update the EMA fingerprint
if frame >= 7 && frame <= 13 {
// emaFP = α*emaFP + (1-α)*fp2
for i := range emaFP {
emaFP[i] = alpha*emaFP[i] + (1-alpha)*fingerPrint2[i]
}
// 5) renormalize emaFP back to unit length
var sum float32
for _, v := range emaFP {
sum += v * v
}
norm := float32(math.Sqrt(float64(sum)))
if norm > 0 {
for i := range emaFP {
emaFP[i] /= norm
}
}
}
}
// close runtime and release resources
err = rt.Close()
if err != nil {
log.Fatal("Error closing RKNN runtime: ", err)
}
log.Println("done")
*/
}
// Box holds object bounding box coordinates (x1, y1, x2, y2)
type Box struct {
X1, Y1, X2, Y2 int
}
// Objects is a struct to represent the compare and dataset objects parsed
// from the objects data file
type Objects struct {
Compare []Box
Dataset []Box
}
// ParseObjects reads the TOML-like objects data file and returns the two lists
// of objects and their bounding box coordinates
func ParseObjects(path string) (*Objects, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
objs := &Objects{}
section := "" // either "compare" or "dataset"
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// skip blank or comment
if line == "" || strings.HasPrefix(line, "#") {
continue
}
// section header
if strings.HasPrefix(line, "[") && strings.HasSuffix(line, "]") {
section = strings.ToLower(line[1 : len(line)-1])
continue
}
// data line, expect four ints separated by commas
fields := strings.Split(line, ",")
if len(fields) != 4 {
return nil, fmt.Errorf("invalid data line %q", line)
}
nums := make([]int, 4)
for i, fstr := range fields {
v, err := strconv.Atoi(strings.TrimSpace(fstr))
if err != nil {
return nil, fmt.Errorf("parsing %q: %w", fstr, err)
}
nums[i] = v
}
// define box
box := Box{nums[0], nums[1], nums[2], nums[3]}
switch section {
case "compare":
objs.Compare = append(objs.Compare, box)
case "dataset":
objs.Dataset = append(objs.Dataset, box)
default:
return nil, fmt.Errorf("line %q outside of a known section", line)
}
}
if err := scanner.Err(); err != nil {
return nil, err
}
return objs, nil
}
// AddObjectToBatch adds the cropped object from the source image to the batch for
// running inference on
func AddObjectToBatch(batch *rknnlite.Batch, srcImg gocv.Mat, obj Box,
scaleSize image.Point) error {
// get the object's region of interest from the source Mat
objRect := image.Rect(obj.X1, obj.Y1, obj.X2, obj.Y2)
objRoi := srcImg.Region(objRect)
objImg := objRoi.Clone()
gocv.Resize(objRoi, &objImg, scaleSize, 0, 0, gocv.InterpolationArea)
defer objRoi.Close()
defer objImg.Close()
return batch.Add(objImg)
}
// CompareObjects compares the outputs of two objects
func CompareObjects(objA []int8, objB []int8, scales float32,
ZPs int32) float32 {
// get the fingerprint of both objects
fpA := reid.DequantizeAndL2Normalize(objA, scales, ZPs)
fpB := reid.DequantizeAndL2Normalize(objB, scales, ZPs)
// compute Euclidean (L2) distance directly
return reid.EuclideanDistance(fpA, fpB)
}


@@ -313,6 +313,19 @@ itself is not 100%. Whilst this demo shows a complete solution, you would still
need to do work to train a better model and test it for your own use case.
## Re-Identification (ReID)
Experimental ReID has been added which follows the implementation of the
[FairMOT](https://github.com/FoundationVision/ByteTrack/tree/main/tutorials/fairmot) tracker,
but makes use of the OSNet model trained on the Market1501 dataset.
ReID is expensive and typically takes around 200ms per frame to complete
on the RK3588 NPU. It offers little accuracy improvement over plain ByteTrack,
which adds negligible overhead to the YOLO object detection.
We need to wait for Rockchip's next-generation RK36xx SoC before this becomes practical.
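To experiment with it anyway, the stream example wires ReID into ByteTrack as
follows; a sketch based on that example (pool size, frame variable, and
threshold are illustrative):
```
// pool of runtimes loaded with the batch-compiled OSNet ReID model
reidPool, err := rknnlite.NewPool(3, "osnet-market1501-batch8-rk3588.rknn",
    []rknnlite.CoreMask{rknnlite.NPUCoreAuto})
if err != nil {
    log.Fatal(err)
}
reidPool.SetWantFloat(false)

// attach ReID to the tracker and pass the full frame with each update
byteTrack := tracker.NewBYTETracker(FPS, FPS*10, 0.5, 0.6, 0.8)
byteTrack.UseReID(reidPool, tracker.Euclidean, 0.51)
trackObjs, err := byteTrack.UpdateWithFrame(
    postprocess.DetectionsToObjects(detectResults), frame)
```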
## Background
The ByteTrack code is a Go conversion of the [C++ project](https://github.com/ifzhang/ByteTrack).


@@ -127,18 +127,27 @@ type Demo struct {
// renderFormat indicates which rendering type to use with instance
// segmentation, outline or mask
renderFormat string
// reidModelFile is the model to use ReID with
reidModelFile string
// reid is a flag to indicate if reid is being used or not
reid bool
// reidPool of rknnlite runtimes to perform inference in parallel
reidPool *rknnlite.Pool
}
// NewDemo returns an instance of Demo, a streaming HTTP server showing
// video with object detection
func NewDemo(vidSrc *VideoSource, modelFile, labelFile string, poolSize int,
modelType string, renderFormat string, rkPlatform string) (*Demo, error) {
modelType string, renderFormat string, rkPlatform string,
reidModelFile string, useReid bool) (*Demo, error) {
var err error
d := &Demo{
vidSrc: vidSrc,
limitObjs: make([]string, 0),
vidSrc: vidSrc,
limitObjs: make([]string, 0),
reidModelFile: reidModelFile,
reid: useReid,
}
if vidSrc.Format == VideoFile {
@@ -220,6 +229,15 @@ func NewDemo(vidSrc *VideoSource, modelFile, labelFile string, poolSize int,
log.Printf("***WARNING*** %s only has 1 TOPS NPU, downgraded to %d FPS\n", rkPlatform, FPS)
}
if d.reid {
if strings.HasPrefix(strings.ToLower(rkPlatform), "rk356") {
log.Fatal("***WARNING*** ReID is unavailable for RK356x platforms as the 1 TOPS NPU is not powerful enough")
}
FPS = 4
FPSinterval = time.Duration(float64(time.Second) / float64(FPS))
log.Println("***WARNING*** ReID is experimental and requires alot of NPU, downgraded to 4 FPS")
}
// load in Model class names
d.labels, err = rknnlite.LoadLabels(labelFile)
@@ -227,6 +245,19 @@ func NewDemo(vidSrc *VideoSource, modelFile, labelFile string, poolSize int,
return nil, fmt.Errorf("Error loading model labels: %w", err)
}
// create pool for ReID
if d.reid {
d.reidPool, err = rknnlite.NewPool(poolSize, reidModelFile,
[]rknnlite.CoreMask{rknnlite.NPUCoreAuto})
if err != nil {
log.Fatalf("Error creating ReID RKNN pool: %v\n", err)
}
// set runtime to leave output tensors as int8
d.reidPool.SetWantFloat(false)
}
return d, nil
}
@@ -360,6 +391,10 @@ func (d *Demo) Stream(w http.ResponseWriter, r *http.Request) {
// record of past object detections for tracking
byteTrack := tracker.NewBYTETracker(FPS, FPS*10, 0.5, 0.6, 0.8)
if d.reid {
byteTrack.UseReID(d.reidPool, tracker.Euclidean, 0.51)
}
// create a trails history
trail := tracker.NewTrail(90)
@@ -491,9 +526,18 @@ func (d *Demo) ProcessFrame(img gocv.Mat, retChan chan<- ResultFrame,
// track detected objects
timing.TrackerStart = time.Now()
trackObjs, err := byteTrack.Update(
postprocess.DetectionsToObjects(detectResults),
)
var trackObjs []*tracker.STrack
if d.reid {
trackObjs, err = byteTrack.UpdateWithFrame(
postprocess.DetectionsToObjects(detectResults),
resImg,
)
} else {
trackObjs, err = byteTrack.Update(
postprocess.DetectionsToObjects(detectResults),
)
}
timing.TrackerEnd = time.Now()
@@ -713,6 +757,8 @@ func main() {
renderFormat := flag.String("r", "outline", "The rendering format used for instance segmentation [outline|mask]")
codecFormat := flag.String("codec", "mjpg", "Web Camera codec The rendering format [mjpg|yuyv]")
rkPlatform := flag.String("p", "rk3588", "Rockchip CPU Model number [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588]")
reidModelFile := flag.String("rm", "../data/models/rk3588/osnet-market1501-batch8-rk3588.rknn", "RKNN compiled OSNet/Re-Identification model file")
useReid := flag.Bool("reid", false, "Enable Re-Identification enhanced tracking")
// Initialize the custom camera resolution flag with a default value
cameraRes := &cameraResFlag{value: "1280x720@30"}
@@ -760,8 +806,12 @@ func main() {
*modelFile = strings.ReplaceAll(*modelFile, "rk3588", *rkPlatform)
}
if f := flag.Lookup("rm"); f != nil && f.Value.String() == f.DefValue && *rkPlatform != "rk3588" {
*reidModelFile = strings.ReplaceAll(*reidModelFile, "rk3588", *rkPlatform)
}
demo, err := NewDemo(vidSrc, *modelFile, *labelFile, *poolSize,
*modelType, *renderFormat, *rkPlatform)
*modelType, *renderFormat, *rkPlatform, *reidModelFile, *useReid)
if err != nil {
log.Fatalf("Error creating demo: %v", err)


@@ -60,7 +60,7 @@ func (r *Runtime) Inference(mats []gocv.Mat) (*Outputs, error) {
Index: uint32(idx),
Type: TensorFloat32,
// multiply by 4 for size of float32
Size: uint32(mat.Cols() * mat.Rows() * mat.Channels() * 4),
Size: uint32(len(data) * 4), // bytes = elements * 4
Fmt: TensorNHWC,
Buf: unsafe.Pointer(&data[0]),
PassThrough: false,
@@ -77,7 +77,7 @@ func (r *Runtime) Inference(mats []gocv.Mat) (*Outputs, error) {
inputs[idx] = Input{
Index: uint32(idx),
Type: TensorUint8,
Size: uint32(mat.Cols() * mat.Rows() * mat.Channels()),
Size: uint32(len(data)), // bytes = elements
Fmt: TensorNHWC,
Buf: unsafe.Pointer(&data[0]),
PassThrough: false,


@@ -109,6 +109,11 @@ func (p *Pool) SetWantFloat(val bool) {
}
}
// Size returns the Pool size
func (p *Pool) Size() int {
return p.size
}
// getRuntimeCore takes an integer and returns the core mask value to use from
// the coremask list
func getRuntimeCore(i int, cores []CoreMask) CoreMask {

postprocess/reid/reid.go (new file, 129 lines)

@@ -0,0 +1,129 @@
package reid
import (
"bytes"
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"math"
)
// DequantizeAndL2Normalize converts a quantized int8 vector "q" into a float32 vector,
// applies dequantization using the provided scale "s" and zero-point "z",
// and then normalizes the result to unit length using L2 normalization.
//
// This is commonly used to convert quantized embedding vectors back to a
// normalized float form for comparison or similarity calculations.
//
// If the resulting vector has zero magnitude, the function returns the
// unnormalized dequantized vector.
func DequantizeAndL2Normalize(q []int8, s float32, z int32) []float32 {
N := len(q)
x := make([]float32, N)
// dequantize
for i := 0; i < N; i++ {
x[i] = float32(int32(q[i])-z) * s
}
// compute L2 norm
var sumSquares float32
for _, v := range x {
sumSquares += v * v
}
norm := float32(math.Sqrt(float64(sumSquares)))
if norm == 0 {
// avoid /0
return x
}
// normalize
for i := 0; i < N; i++ {
x[i] /= norm
}
return x
}
// FingerprintHash takes an L2-normalized []float32 and returns
// a hex-encoded SHA-256 hash of its binary representation.
func FingerprintHash(feat []float32) (string, error) {
buf := new(bytes.Buffer)
// write each float32 in little-endian
for _, v := range feat {
if err := binary.Write(buf, binary.LittleEndian, v); err != nil {
return "", err
}
}
sum := sha256.Sum256(buf.Bytes())
return hex.EncodeToString(sum[:]), nil
}
// CosineSimilarity returns the cosine of the angle between vectors a and b.
// Assumes len(a)==len(b). If you have already L2-normalized them,
// this is just their dot-product.
func CosineSimilarity(a, b []float32) float32 {
var dot float32
for i := range a {
dot += a[i] * b[i]
}
// If not already normalized, you'd divide by norms here.
return dot
}
// CosineDistance returns 1 - cosine similarity. For L2-normalized vectors
// this lies in [0,2], and small values mean "very similar".
func CosineDistance(a, b []float32) float32 {
return 1 - CosineSimilarity(a, b)
}
// EuclideanDistance returns the L2 distance between two vectors.
// Lower means "more similar" when your features are L2-normalized.
func EuclideanDistance(a, b []float32) float32 {
var sum float32
for i := range a {
d := a[i] - b[i]
sum += d * d
}
return float32(math.Sqrt(float64(sum)))
}
// NormalizeVec normalizes the input float32 slice to unit length and returns
// a new slice. If the input vector has zero magnitude, it returns the original
// slice unchanged.
func NormalizeVec(v []float32) []float32 {
norm := float32(0.0)
for _, x := range v {
norm += x * x
}
if norm == 0 {
return v // avoid division by zero
}
norm = float32(math.Sqrt(float64(norm)))
out := make([]float32, len(v))
for i, x := range v {
out[i] = x / norm
}
return out
}
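Taken together, the typical flow through these helpers is: dequantize the model's int8 embedding, then fingerprint or compare the normalized result. A minimal sketch; the raw values, scale and zero-point below are illustrative stand-ins for what the runtime's output attributes provide in practice:

```
package main

import (
	"fmt"

	"github.com/swdee/go-rknnlite/postprocess/reid"
)

func main() {
	// illustrative quantized embedding and quantization parameters
	raw := []int8{12, -3, 45, 7}
	scale, zeroPoint := float32(0.021), int32(-14)

	// convert to a unit-length float32 feature vector
	feat := reid.DequantizeAndL2Normalize(raw, scale, zeroPoint)

	// stable identifier for caching or de-duplication
	hash, err := reid.FingerprintHash(feat)
	if err != nil {
		panic(err)
	}
	fmt.Println("fingerprint:", hash)

	// compare against another normalized embedding
	other := reid.NormalizeVec([]float32{0.1, 0.2, 0.3, 0.4})
	fmt.Println("cosine distance:", reid.CosineDistance(feat, other))
	fmt.Println("euclidean distance:", reid.EuclideanDistance(feat, other))
}
```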

View File

@@ -241,6 +241,11 @@ func (r *Runtime) SetInputTypeFloat32(val bool) {
r.inputTypeFloat32 = val
}
// GetInputTypeFloat32 returns the input type if set as Float32 (true) or Int8 (false)
func (r *Runtime) GetInputTypeFloat32() bool {
return r.inputTypeFloat32
}
// SDKVersion represents the C.rknn_sdk_version struct
type SDKVersion struct {
DriverVersion string

View File

@@ -73,9 +73,6 @@ RUN git clone --depth 1 https://github.com/swdee/lpd-yolov8.git /opt/lpd-yolov8
cp /opt/rknn_model_zoo/examples/yolov8/python/convert.py /opt/rknn_model_zoo/examples/yolov8/python/convert-lpd.py && \
sed -i "s|^DATASET_PATH *= *['\"].*['\"]|DATASET_PATH = '/opt/lpd-yolov8/subset.txt'|" /opt/rknn_model_zoo/examples/yolov8/python/convert-lpd.py
# download other onnx models
RUN git clone --depth 1 https://github.com/swdee/go-rknnlite-build.git /opt/go-rknnlite-build
# Upgrade pip to the latest version
RUN pip install --upgrade pip
@@ -92,6 +89,15 @@ RUN pip install --no-cache-dir \
pyyaml \
"tensorflow<=2.16.0rc0"
# download other onnx models
RUN git clone --depth 1 https://github.com/swdee/go-rknnlite-build.git /opt/go-rknnlite-build && \
git -C /opt/go-rknnlite-build fetch --depth 1 origin ce8b5ce1dc53b1c38324e7506374731ad21070c8 && \
git -C /opt/go-rknnlite-build checkout FETCH_HEAD
# copy our modified mobilenet.py script into the rknn_model_zoo directory
RUN cp /opt/go-rknnlite-build/mobilenet-batch/mobilenet-rknn.py /opt/rknn_model_zoo/examples/mobilenet/python/mobilenet-rknn-batch.py
# By default do nothing
CMD ["bash"]

View File

@@ -30,6 +30,8 @@ MODELS=(
"mobilenet_v1 rknn_convert /opt/models/mobilenet_v1/model_config.yml '' '' mobilenet_v1"
"yolov8 convert-lpd.py /opt/lpd-yolov8/lpd-yolov8n.onnx i8 '' lpd-yolov8n"
"yolov8 convert.py /opt/go-rknnlite-build/yolonas-s.onnx i8 '' yolonas-s"
"mobilenet mobilenet-rknn-batch.py ../model/mobilenetv2-12.onnx i8 --model mobilenetv2-batch8"
"osnet-market1501 build|onnx_to_rknn.py osnet_x1_0_market_256x128.onnx i8 '' osnet-market1501-batch8"
)
# compile all entries (or just filter) for one platform
@@ -75,6 +77,7 @@ compile_for_platform() {
fi
echo "-> building $outprefix for $platform"
local out="/opt/rkmodels/${platform}/${outprefix}-${platform}.rknn"
if [[ "$script" == "rknn_convert" ]]; then
# mobilenet_v1 special: use the CLI and then rename
@@ -83,13 +86,23 @@ compile_for_platform() {
-i "$model" \
-o "/opt/rkmodels/$platform/"
mv "/opt/rkmodels/$platform/${outprefix}.rknn" \
"/opt/rkmodels/$platform/${outprefix}-${platform}.rknn"
"$out"
continue
fi
# build the go-rknnlite-build models
if [[ "$script" == build\|* ]]; then
# strip everything up to (and including) the first pipe to get script name
scriptName="${script#*|}"
# go into the go-rknnlite-build tree
pushd "/opt/go-rknnlite-build/${subdir}" >/dev/null
python "$scriptName" "$model" "$platform" "$dtype" "$out"
popd >/dev/null
continue
fi
# the old examples
pushd "/opt/rknn_model_zoo/examples/${subdir}/python/" >/dev/null
local out="/opt/rkmodels/${platform}/${outprefix}-${platform}.rknn"
if [[ "$subdir" == "mobilenet" ]]; then
python "$script" $extra "$model" \

View File

@@ -25,6 +25,10 @@ type BYTETracker struct {
lostStracks []*STrack
// List of removed objects
removedStracks []*STrack
// reid holds the ReID processing instance used for supported tracking
reid *reID
// useReid is a flag to indicate if ReID supported tracking is to be used
useReid bool
}
// NewBYTETracker initializes and returns a new BYTETracker
@@ -62,6 +66,10 @@ func (bt *BYTETracker) Update(objects []Object) ([]*STrack, error) {
strack := NewSTrack(NewRect(object.Rect.X(), object.Rect.Y(), object.Rect.Width(), object.Rect.Height()),
object.Prob, object.ID, object.Label)
if bt.useReid {
strack.WithFeature(object.Feature, 0.9, 30)
}
if object.Prob >= bt.trackThresh {
detStracks = append(detStracks, strack)
} else {
@@ -87,11 +95,18 @@ func (bt *BYTETracker) Update(objects []Object) ([]*STrack, error) {
strack.Predict()
}
// Step 2: First association, with IoU
// Step 2: First association, using IoU or feature distance matching
var currentTrackedStracks, remainTrackedStracks, remainDetStracks, refindStracks []*STrack
var costMatrix [][]float32
if bt.useReid {
costMatrix = bt.calcFeatureDistance(strackPool, detStracks)
} else {
costMatrix = bt.calcIouDistance(strackPool, detStracks)
}
matchesIdx, unmatchTrackIdx, unmatchDetectionIdx, err := bt.linearAssignment(
bt.calcIouDistance(strackPool, detStracks),
costMatrix,
len(strackPool), len(detStracks), bt.matchThresh,
)
@@ -126,7 +141,8 @@ func (bt *BYTETracker) Update(objects []Object) ([]*STrack, error) {
}
}
// Step 3: Second association, using low score dets
// Step 3: IoU fallback matching for unmatched tracks,
// using low-score detections
var currentLostStracks []*STrack
matchesIdx, unmatchTrackIdx, unmatchDetectionIdx, err = bt.linearAssignment(
@@ -162,6 +178,7 @@ func (bt *BYTETracker) Update(objects []Object) ([]*STrack, error) {
}
// Step 4: Init new stracks
// Match non-active to unmatched remainingDetStracks (high confidence only)
var currentRemovedStracks []*STrack
matchesIdx, unmatchUnconfirmedIdx, unmatchDetectionIdx, err := bt.linearAssignment(
@@ -197,7 +214,7 @@ func (bt *BYTETracker) Update(objects []Object) ([]*STrack, error) {
currentTrackedStracks = append(currentTrackedStracks, track)
}
// Step 5: Update state
// Step 5: Update state - Time-based removal of old lost tracks
for _, lostStrack := range bt.lostStracks {
if bt.frameID-lostStrack.GetFrameID() > bt.maxTimeLost {
lostStrack.MarkAsRemoved()
@@ -508,3 +525,21 @@ func (bt *BYTETracker) execLapjv(cost [][]float32, extendCost bool,
return rowsol, colsol, opt, nil
}
// calcFeatureDistance builds a cost matrix of embedded feature distances
// between the given tracks and detections
func (bt *BYTETracker) calcFeatureDistance(tracks, detections []*STrack) [][]float32 {
cost := make([][]float32, len(tracks))
for i, tr := range tracks {
cost[i] = make([]float32, len(detections))
for j, det := range detections {
cost[i][j] = tr.BestMatchDistance(det.feature)
}
}
return cost
}
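The matrix has one row per existing track and one column per detection, with each entry capped at 1.0 by BestMatchDistance. The same shape can be built directly from raw embeddings using the reid helpers; a standalone sketch:

```
// featureCostMatrix mirrors calcFeatureDistance but works on raw
// embeddings rather than STracks
func featureCostMatrix(trackFeats, detFeats [][]float32) [][]float32 {
	cost := make([][]float32, len(trackFeats))
	for i, tf := range trackFeats {
		cost[i] = make([]float32, len(detFeats))
		for j, df := range detFeats {
			// lower distance means more likely the same object
			cost[i][j] = reid.EuclideanDistance(
				reid.NormalizeVec(tf), reid.NormalizeVec(df))
		}
	}
	return cost
}
```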

View File

@@ -11,6 +11,8 @@ type Object struct {
// ID is a unique ID to give this object which can be used to match
// the input detection object and tracked object
ID int64
// Feature is a ReID embedding feature
Feature []float32
}
// NewObject is a constructor function for the Object struct
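When calling Update() directly instead of UpdateWithFrame(), the Feature field must be filled by the caller with the embedding from ReID inference. A sketch using a struct literal; the remaining fields (Rect, Prob, Label) are set as in the existing examples:

```
obj := tracker.Object{
	ID:      42,   // detection ID to match input and tracked objects
	Feature: feat, // ReID embedding, e.g. from reid.DequantizeAndL2Normalize
}
```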

246
tracker/reid.go Normal file
View File

@@ -0,0 +1,246 @@
package tracker
import (
"fmt"
"github.com/swdee/go-rknnlite"
"github.com/swdee/go-rknnlite/postprocess/reid"
"gocv.io/x/gocv"
"image"
"sync"
)
// DistanceMethod defines ReID distance calculation methods
type DistanceMethod int
const (
Euclidean DistanceMethod = 1
Cosine DistanceMethod = 2
)
// reID struct holds all ReIdentification processing features
type reID struct {
// pool is the rknnlite runtime pool to run inference on
pool *rknnlite.Pool
// dist is the distance method used to determine similarity
dist DistanceMethod
// threshold is the distance cutoff used to decide whether two objects
// are considered the same
threshold float32
// batchSize stores the model input tensor batch size
batchSize int
width int
height int
channels int
// batchPool holds a pool of batches
batchPool *rknnlite.BatchPool
// scaleSize is the input tensor size to scale the object to
scaleSize image.Point
}
// UseReID sets up Re-Identification processing on the BYTETracker instance
func (bt *BYTETracker) UseReID(pool *rknnlite.Pool, dist DistanceMethod,
threshold float32) {
// query runtime and get tensor dimensions
rt := pool.Get()
batchSize := int(rt.InputAttrs()[0].Dims[0])
width := int(rt.InputAttrs()[0].Dims[1])
height := int(rt.InputAttrs()[0].Dims[2])
channels := int(rt.InputAttrs()[0].Dims[3])
bt.reid = &reID{
pool: pool,
dist: dist,
threshold: threshold,
batchSize: batchSize,
width: width,
height: height,
channels: channels,
scaleSize: image.Pt(width, height),
batchPool: rknnlite.NewBatchPool(pool.Size(), rt),
}
pool.Return(rt)
bt.useReid = true
}
// UpdateWithFrame updates the tracker with new detections and passes the
// image frame so ReID inference can be conducted
func (bt *BYTETracker) UpdateWithFrame(objects []Object, frame gocv.Mat) ([]*STrack, error) {
// check if ReID is enabled and get embedding features for all objects
if bt.useReid {
bufFrame := frame.Clone()
defer bufFrame.Close()
features, err := bt.reid.processObjects(objects, bufFrame)
if err != nil {
return nil, fmt.Errorf("failed to process objects: %w", err)
}
for i := range objects {
objects[i].Feature = features[i]
}
}
// run track update
tracks, err := bt.Update(objects)
if err != nil {
return nil, fmt.Errorf("error updating objects: %w", err)
}
return tracks, nil
}
// Close frees memory from reid instance
func (r *reID) Close() {
r.batchPool.Close()
}
// processObjects takes the detected objects and runs inference on them to get
// their embedded feature fingerprints. Each batch is dispatched to its own
// goroutine and the call blocks until all batches have completed.
func (r *reID) processObjects(objects []Object, frame gocv.Mat) ([][]float32, error) {
var wg sync.WaitGroup
total := len(objects)
// collect per-object feature embeddings
allEmbeddings := make([][]float32, total)
errCh := make(chan error, (total+r.batchSize-1)/r.batchSize)
for offset := 0; offset < total; offset += r.batchSize {
end := offset + r.batchSize
if end > total {
end = total
}
batchObjs := objects[offset:end]
// capture range variables for closure
capOffset := offset
capCnt := end - offset
wg.Add(1)
batch := r.batchPool.Get()
rt := r.pool.Get()
go func(rt *rknnlite.Runtime, batch *rknnlite.Batch, bobjs []Object, off, cnt int) {
defer wg.Done()
fps, err := r.processBatch(rt, batch, bobjs, frame)
r.pool.Return(rt)
r.batchPool.Return(batch)
if err != nil {
errCh <- err
return
}
// copy this batch's fingerprints into the correct offset of the
// overall results
for i := 0; i < cnt; i++ {
allEmbeddings[off+i] = fps[i]
}
errCh <- nil
}(rt, batch, batchObjs, capOffset, capCnt)
}
wg.Wait()
close(errCh)
// if any error, just bail
for e := range errCh {
if e != nil {
return nil, fmt.Errorf("ReID error: %w", e)
}
}
return allEmbeddings, nil
}
// processBatch adds the objects to a batch and runs inference on them
func (r *reID) processBatch(rt *rknnlite.Runtime, batch *rknnlite.Batch,
bobjs []Object, frame gocv.Mat) ([][]float32, error) {
height := frame.Rows()
width := frame.Cols()
for _, obj := range bobjs {
// clamp and get bounding box coordinates
x1 := clamp(int(obj.Rect.TLX()), 0, width)
y1 := clamp(int(obj.Rect.TLY()), 0, height)
x2 := clamp(int(obj.Rect.BRX()), 0, width)
y2 := clamp(int(obj.Rect.BRY()), 0, height)
objRect := image.Rect(x1, y1, x2, y2)
// get the object's region of interest from the source Mat
objRoi := frame.Region(objRect)
objImg := gocv.NewMat()
// resize to input tensor size
gocv.Resize(objRoi, &objImg, r.scaleSize, 0, 0, gocv.InterpolationArea)
objRoi.Close()
err := batch.Add(objImg)
objImg.Close()
if err != nil {
return nil, fmt.Errorf("error adding image to batch: %w", err)
}
}
// run inference on the batch
outputs, err := rt.Inference([]gocv.Mat{batch.Mat()})
if err != nil {
return nil, fmt.Errorf("inference failed: %v", err)
}
defer outputs.Free()
// unpack per object results
fingerprints := make([][]float32, len(bobjs))
for idx := 0; idx < len(bobjs); idx++ {
output, err := batch.GetOutputInt(idx, outputs.Output[0], int(outputs.OutputAttributes().DimForDFL))
if err != nil {
return nil, fmt.Errorf("error getting output %d: %v", idx, err)
}
// get object fingerprint
fingerprints[idx] = reid.DequantizeAndL2Normalize(
output,
outputs.OutputAttributes().Scales[0],
outputs.OutputAttributes().ZPs[0],
)
}
return fingerprints, nil
}
// clamp restricts the value val to be within the range min and max
func clamp(val, min, max int) int {
if val > min {
if val < max {
return val
}
}
return max
}
return min
}
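End to end, enabling ReID on a tracker takes three steps: create a runtime pool for the ReID model, call UseReID, then feed frames through UpdateWithFrame instead of Update. A hedged sketch; pool and tracker construction follow the existing examples, and the 0.4 threshold, detect() helper and frames channel are placeholders:

```
// assumes: pool created for an osnet ReID model as in example/pool,
// and bt created as in the existing ByteTrack examples
bt.UseReID(pool, tracker.Euclidean, 0.4)

for frame := range frames { // gocv.Mat video frames
	// YOLO detections converted to tracker.Object values
	objects := detect(frame)

	tracks, err := bt.UpdateWithFrame(objects, frame)
	if err != nil {
		log.Fatal(err)
	}

	for _, t := range tracks {
		_ = t.GetRect() // draw or report each tracked object
	}
}
```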

View File

@@ -2,6 +2,7 @@ package tracker
import (
"fmt"
"github.com/swdee/go-rknnlite/postprocess/reid"
"gonum.org/v1/gonum/mat"
)
@@ -47,6 +48,18 @@ type STrack struct {
detectionID int64
// label is the object label/class from yolo inference
label int
// feature embedding used for ReID
feature []float32
// smoothFeature an EMA smoothed feature embedding used for ReID
smoothFeature []float32
// featureQueue is a history of features
featureQueue [][]float32
// maxQueueSize is the featureQueue maximum size, eg: 30
maxQueueSize int
// alpha is the value used in the EMA smoothing calculation
alpha float32
// hasFeature is a flag to indicate if WithFeature() has been set
hasFeature bool
}
// NewSTrack creates a new STrack
@@ -68,6 +81,15 @@ func NewSTrack(rect Rect, score float32, detectionID int64, label int) *STrack {
}
}
// WithFeature adds an object's embedded feature from ReID inference to the STrack
func (s *STrack) WithFeature(feature []float32, alpha float32, qsize int) {
s.hasFeature = true
s.alpha = alpha
s.maxQueueSize = qsize
s.featureQueue = make([][]float32, 0, qsize)
s.UpdateFeatures(feature)
}
// GetRect returns the bounding box of the tracked object
func (s *STrack) GetRect() *Rect {
return &s.rect
@@ -155,6 +177,8 @@ func (s *STrack) ReActivate(newTrack *STrack, frameID, newTrackID int) {
s.frameID = frameID
s.trackletLen = 0
s.UpdateFeatures(newTrack.feature)
}
// Predict predicts the next state of the track
@@ -185,6 +209,8 @@ func (s *STrack) Update(newTrack *STrack, frameID int) error {
s.frameID = frameID
s.trackletLen++
s.UpdateFeatures(newTrack.feature)
return nil
}
@@ -205,3 +231,58 @@ func (s *STrack) updateRect() {
s.rect.SetX(s.mean[0] - s.rect.Width()/2)
s.rect.SetY(s.mean[1] - s.rect.Height()/2)
}
// UpdateFeatures updates an STrack's ReID embedded features
func (s *STrack) UpdateFeatures(feat []float32) {
if !s.hasFeature {
return
}
normFeat := reid.NormalizeVec(feat)
s.feature = normFeat
if s.smoothFeature == nil {
s.smoothFeature = make([]float32, len(normFeat))
copy(s.smoothFeature, normFeat)
} else {
for i := range normFeat {
s.smoothFeature[i] = s.alpha*s.smoothFeature[i] + (1-s.alpha)*normFeat[i]
}
s.smoothFeature = reid.NormalizeVec(s.smoothFeature)
}
// Enqueue the feature
s.featureQueue = append(s.featureQueue, normFeat)
if len(s.featureQueue) > s.maxQueueSize {
s.featureQueue = s.featureQueue[1:]
}
}
// BestMatchDistance compares a new detection against all stored past features
// and returns the smallest Euclidean distance found
func (s *STrack) BestMatchDistance(detFeat []float32) float32 {
if !s.hasFeature {
// feature not set so return max distance
return 1.0
}
if len(s.featureQueue) == 0 {
return 1.0 // max distance
}
detNorm := reid.NormalizeVec(detFeat)
best := float32(1.0)
for _, f := range s.featureQueue {
d := reid.EuclideanDistance(f, detNorm)
if d < best {
best = d
}
}
return best
}
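For intuition on the EMA update in UpdateFeatures: with the alpha of 0.9 that Update() passes to WithFeature(), each new observation contributes only 10% to the smoothed embedding, so a track's appearance adapts gradually rather than jumping to the latest crop. One update step with illustrative numbers:

```
// one EMA step, mirroring UpdateFeatures
alpha := float32(0.9)
smooth := []float32{0.6, 0.8} // previous smoothed feature (unit length)
newF := []float32{1.0, 0.0}   // latest normalized observation

for i := range newF {
	smooth[i] = alpha*smooth[i] + (1-alpha)*newF[i]
}
// re-normalize to unit length, as UpdateFeatures does
smooth = reid.NormalizeVec(smooth)
// smooth is now ~[0.664, 0.747]: nudged toward newF, dominated by history
```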