added midas depth estimation example

2025-12-24 10:30:56 +08:00 · 2025-12-19 22:54:34 +13:00
parent b31b9b3075
commit 23d156bae2
3 changed files with 451 additions and 0 deletions
--- a/example/midas/README.md
+++ b/example/midas/README.md
@@ -0,0 +1,92 @@
+# MiDaS Depth Estimation Example
+
+## Overview
+
+This example uses the [MiDaS v3.1 depth estimation](https://github.com/isl-org/MiDaS/)
+model to estimate depth from a single image.
+
+
+## Usage
+
+Make sure you have downloaded the data files first for the examples.
+You only need to do this once for all examples.
+
+```
+cd example/
+git clone --depth=1 https://github.com/swdee/go-rknnlite-data.git data
+```
+
+![bedroom.jpg](https://github.com/swdee/go-rknnlite-data/raw/master/bedroom.jpg)
+
+Run the MiDaS example on the above bedroom scene on rk3588, or replace `rk3588` with your own platform model.
+```
+cd example/midas
+go run midas.go -p rk3588
+```
+
+This will result in the output of:
+```
+Driver Version: 0.9.6, API Version: 2.3.0 (c949ad889d@2024-11-07T11:35:33)
+Model Input Number: 1, Output Number: 1
+Input tensors:
+  index=0, name=input, n_dims=4, dims=[1, 256, 256, 3], n_elems=196608, size=196608, fmt=NHWC, type=INT8, qnt_type=AFFINE, zp=0, scale=0.007843
+Output tensors:
+  index=0, name=depth, n_dims=4, dims=[1, 1, 256, 256], n_elems=65536, size=65536, fmt=NCHW, type=INT8, qnt_type=AFFINE, zp=-128, scale=19.864582
+Model first run speed: inference=577.442314ms, post processing=1.180646ms, rendering=3.137693ms, total time=581.760653ms
+Saved depth map result to ../data/bedroom-out.jpg
+Benchmark time=12.167970519s, count=20, average total time=608.398525ms
+done
+```
+
+The saved JPG image shows the depth estimation map.
+
+![midas-bedroom-out.jpg](https://github.com/swdee/go-rknnlite-data/raw/master/docimg/midas-bedroom-out.jpg)
+
+
+See the help for command line parameters.
+```
+$ go run midas.go -h
+
+Usage of /tmp/go-build2937772053/b001/exe/midas:
+  -i string
+        Image file to run depth estimation on (default "../data/bedroom.jpg")
+  -m string
+        RKNN compiled depth model file (default "../data/models/rk3588/dpt_swin2_tiny_256-rk3588.rknn")
+  -o string
+        Output JPG file (depth visualization) (default "../data/bedroom-out.jpg")
+  -p string
+        Rockchip platform [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588] (default "rk3588")
+```
+
+
+
+
+### Docker
+
+To run the MiDaS example using the prebuilt docker image, make sure the data files have been downloaded first,
+then run.
+```
+# from project root directory
+
+docker run --rm \
+  --device /dev/dri:/dev/dri \
+  -v "$(pwd):/go/src/app" \
+  -v "$(pwd)/example/data:/go/src/data" \
+  -v "/usr/include/rknn_api.h:/usr/include/rknn_api.h" \
+  -v "/usr/lib/librknnrt.so:/usr/lib/librknnrt.so" \
+  -w /go/src/app \
+  swdee/go-rknnlite:latest \
+  go run ./example/midas/midas.go -p rk3588
+```
+
+
+## Benchmarks
+
+The following table shows a comparison of the benchmark results across the three distinct platforms.
+
+| Platform | Execution Time | Average Inference Time Per Image |
+|----------|----------------|----------------------------------|
+| rk3588   | 12.16s         | 608.39ms                         |
+| rk3576   | 16.85s         | 842.97ms                         |
+| rk3566   | 37.49s         | 1.87s                            |
+
--- a/example/midas/midas.go
+++ b/example/midas/midas.go
@@ -0,0 +1,189 @@
+/*
+Example code showing how to perform depth estimation using a MiDaS model.
+*/
+package main
+
+import (
+	"flag"
+	"image"
+	"log"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/swdee/go-rknnlite"
+	"github.com/swdee/go-rknnlite/postprocess"
+	"gocv.io/x/gocv"
+)
+
+// main loads an RKNN compiled MiDaS model, runs depth estimation on a single
+// image, writes the resulting depth map to disk and then runs a short
+// benchmark over the same input.
+//
+// NOTE: every fatal error path uses log.Fatal, which exits the process
+// immediately; deferred Close calls are not run in that case.
+func main() {
+	// disable logging timestamps
+	log.SetFlags(0)
+
+	// read in cli flags
+	modelFile := flag.String("m", "../data/models/rk3588/dpt_swin2_tiny_256-rk3588.rknn", "RKNN compiled depth model file")
+	imgFile := flag.String("i", "../data/bedroom.jpg", "Image file to run depth estimation on")
+	saveFile := flag.String("o", "../data/bedroom-out.jpg", "Output JPG file (depth visualization)")
+	rkPlatform := flag.String("p", "rk3588", "Rockchip platform [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588]")
+
+	flag.Parse()
+
+	// apply the rknnlite.FastCores CPU affinity for the selected platform.
+	// a failure here is only logged, the example carries on regardless
+	err := rknnlite.SetCPUAffinityByPlatform(*rkPlatform, rknnlite.FastCores)
+
+	if err != nil {
+		log.Printf("Failed to set CPU affinity: %v\n", err)
+	}
+
+	// check if user specified model file or if default is being used.  if default
+	// then pick the default platform model to use.  this is done by replacing
+	// every "rk3588" in the default path with the chosen platform name
+	if f := flag.Lookup("m"); f != nil && f.Value.String() == f.DefValue && *rkPlatform != "rk3588" {
+		*modelFile = strings.ReplaceAll(*modelFile, "rk3588", *rkPlatform)
+	}
+
+	// create rknn runtime instance
+	rt, err := rknnlite.NewRuntimeByPlatform(*rkPlatform, *modelFile)
+
+	if err != nil {
+		log.Fatal("Error initializing RKNN runtime: ", err)
+	}
+
+	// We want float32 outputs for easy depth visualization
+	rt.SetWantFloat(true)
+
+	// optional querying of model file tensors and SDK version for printing
+	// to stdout.  not necessary for production inference code
+	err = rt.Query(os.Stdout)
+
+	if err != nil {
+		log.Fatal("Error querying runtime: ", err)
+	}
+
+	// create midas post processor
+	midasProcessor := postprocess.NewMiDaS(postprocess.MiDaSDefaultParams())
+
+	// load image
+	img := gocv.IMRead(*imgFile, gocv.IMReadColor)
+
+	if img.Empty() {
+		log.Fatal("Error reading image from: ", *imgFile)
+	}
+
+	// convert colorspace and resize image to input tensor size
+	rgbImg := gocv.NewMat()
+	gocv.CvtColor(img, &rgbImg, gocv.ColorBGRToRGB)
+
+	// cropImg receives the resized image.  the target size is built as
+	// (Dims[2], Dims[1]) of the first input tensor
+	// NOTE(review): assumes the input tensor layout is NHWC so that Dims[2] is
+	// the width and Dims[1] is the height - confirm for other models
+	cropImg := rgbImg.Clone()
+	scaleSize := image.Pt(int(rt.InputAttrs()[0].Dims[2]), int(rt.InputAttrs()[0].Dims[1]))
+	gocv.Resize(rgbImg, &cropImg, scaleSize, 0, 0, gocv.InterpolationArea)
+
+	// release the image Mats when main returns
+	defer img.Close()
+	defer rgbImg.Close()
+	defer cropImg.Close()
+
+	// timing checkpoints: start -> endInference -> endCreateMap -> endRendering
+	start := time.Now()
+
+	// perform inference on image file
+	outputs, err := rt.Inference([]gocv.Mat{cropImg})
+
+	if err != nil {
+		log.Fatal("Runtime inferencing failed with error: ", err)
+	}
+
+	endInference := time.Now()
+
+	// post process and create depth map
+	depthMap := gocv.NewMat()
+	defer depthMap.Close()
+	err = midasProcessor.CreateDepthMap(outputs, depthMap)
+
+	if err != nil {
+		log.Fatal("Error creating depth map: ", err)
+	}
+
+	endCreateMap := time.Now()
+
+	// resize the color map back to the original input image size
+	resizedMap := gocv.NewMat()
+	defer resizedMap.Close()
+	gocv.Resize(depthMap, &resizedMap, image.Pt(img.Cols(), img.Rows()), 0, 0, gocv.InterpolationCubic)
+
+	endRendering := time.Now()
+
+	// "rendering" here covers only the resize step above, saving the file is
+	// not included in the reported times
+	log.Printf("Model first run speed: inference=%s, post processing=%s, rendering=%s, total time=%s\n",
+		endInference.Sub(start).String(),
+		endCreateMap.Sub(endInference).String(),
+		endRendering.Sub(endCreateMap).String(),
+		endRendering.Sub(start).String(),
+	)
+
+	// Save the result
+	if ok := gocv.IMWrite(*saveFile, resizedMap); !ok {
+		log.Fatal("Failed to save the image")
+	}
+
+	log.Printf("Saved depth map result to %s\n", *saveFile)
+
+	// free outputs allocated in C memory after you have finished post processing
+	err = outputs.Free()
+
+	if err != nil {
+		log.Fatal("Error freeing Outputs: ", err)
+	}
+
+	// optional code.  run benchmark to get average time.  the already resized
+	// cropImg is reused as input and img supplies the target output size
+	runBenchmark(rt, midasProcessor, []gocv.Mat{cropImg}, img)
+
+	// close runtime and release resources
+	err = rt.Close()
+
+	if err != nil {
+		log.Fatal("Error closing RKNN runtime: ", err)
+	}
+
+	log.Println("done")
+}
+
+func runBenchmark(rt *rknnlite.Runtime, midasProcessor *postprocess.MiDaS,
+	mats []gocv.Mat, srcImg gocv.Mat) {
+
+	count := 20
+	start := time.Now()
+
+	depthMap := gocv.NewMat()
+	defer depthMap.Close()
+	resizedMap := gocv.NewMat()
+	defer resizedMap.Close()
+
+	for i := 0; i < count; i++ {
+		// perform inference on image file
+		outputs, err := rt.Inference(mats)
+
+		if err != nil {
+			log.Fatal("Runtime inferencing failed with error: ", err)
+		}
+
+		// post process
+		err = midasProcessor.CreateDepthMap(outputs, depthMap)
+
+		if err != nil {
+			log.Fatal("Error creating depth map: ", err)
+		}
+
+		// resize the color map back to the original input image size
+		gocv.Resize(depthMap, &resizedMap, image.Pt(srcImg.Cols(), srcImg.Rows()), 0, 0, gocv.InterpolationCubic)
+
+		err = outputs.Free()
+
+		if err != nil {
+			log.Fatal("Error freeing Outputs: ", err)
+		}
+	}
+
+	end := time.Now()
+	total := end.Sub(start)
+	avg := total / time.Duration(count)
+
+	log.Printf("Benchmark time=%s, count=%d, average total time=%s\n",
+		total.String(), count, avg.String(),
+	)
+}
--- a/postprocess/midas.go
+++ b/postprocess/midas.go
@@ -0,0 +1,170 @@
+package postprocess
+
+import (
+	"fmt"
+	"math"
+
+	"github.com/swdee/go-rknnlite"
+	"gocv.io/x/gocv"
+)
+
+// MiDaS defines the struct for a MiDaS depth estimation inference post processing.
+// It converts the model's output tensor into a depth map image, see
+// CreateDepthMap.
+type MiDaS struct {
+	// Params are the depth map configuration parameters
+	Params MiDaSParams
+}
+
+// GrayscaleMap is a sentinel colormap value.  When it is set as
+// MiDaSParams.Colormap no coloring is applied and the output depth map is
+// left as grayscale.
+// NOTE(review): 9999 is presumably chosen so it does not collide with any
+// real gocv colormap type - confirm against the gocv constants.
+const GrayscaleMap = gocv.ColormapTypes(9999)
+
+// MiDaSParams holds the configuration options used when rendering the depth
+// map image.
+type MiDaSParams struct {
+	// Invert the depth map.  When true each normalized depth value n is
+	// replaced by 1-n before it is converted to 8-bit
+	Invert bool
+	// Colormap to apply to depth map, if you want it left as grayscale then
+	// pass postprocess.GrayscaleMap
+	Colormap gocv.ColormapTypes
+}
+
+// MiDaSDefaultParams sets output depth map to non-inverting and use Hot color scheme
+func MiDaSDefaultParams() MiDaSParams {
+	return MiDaSParams{
+		Invert:   false,
+		Colormap: gocv.ColormapHot,
+	}
+}
+
+// NewMiDaS returns an instance of the MiDaS post processor configured with
+// the given parameters.
+func NewMiDaS(p MiDaSParams) *MiDaS {
+	processor := new(MiDaS)
+	processor.Params = p
+
+	return processor
+}
+
+// CreateDepthMap converts the tensor output data into a depth estimation map
+// image and writes the result into depthMat.
+//
+// The float data of the first output tensor is normalized to an 8-bit
+// grayscale image.  If Params.Colormap is GrayscaleMap that image is copied
+// into depthMat as is, otherwise the configured colormap is applied.
+//
+// An error is returned when outputs carries no tensors or dimensions, when the
+// first tensor holds fewer float values than height*width (for example if the
+// runtime was not set to return float outputs), or when the intermediate Mat
+// cannot be created.
+func (m *MiDaS) CreateDepthMap(outputs *rknnlite.Outputs, depthMat gocv.Mat) error {
+
+	// guard the [0] indexing below so bad input yields an error, not a panic
+	if outputs == nil || len(outputs.Output) == 0 {
+		return fmt.Errorf("no output tensors to create depth map from")
+	}
+
+	// output tensor is in NCHW format
+	// get output tensor width/height
+	attrs := outputs.OutputAttributes()
+
+	if len(attrs.DimHeights) == 0 || len(attrs.DimWidths) == 0 {
+		return fmt.Errorf("output tensor dimensions are not available")
+	}
+
+	outH := int(attrs.DimHeights[0])
+	outW := int(attrs.DimWidths[0])
+
+	depth := outputs.Output[0].BufFloat
+
+	// a short buffer would otherwise be read as zeros and silently produce a
+	// black or partially black image
+	if outH <= 0 || outW <= 0 || len(depth) < outH*outW {
+		return fmt.Errorf("output tensor has %d float values, need %d for a %dx%d depth map",
+			len(depth), outH*outW, outW, outH)
+	}
+
+	// Convert float depth to uint8 visualization
+	depthU8 := m.depthToU8(depth, outH, outW)
+
+	// Make a Mat from bytes
+	u8Mat, err := gocv.NewMatFromBytes(outH, outW, gocv.MatTypeCV8U, depthU8)
+
+	if err != nil {
+		// wrap with %w so callers can inspect the cause with errors.Is/As
+		return fmt.Errorf("Failed to create depth mat: %w", err)
+	}
+
+	defer u8Mat.Close()
+
+	if m.Params.Colormap == GrayscaleMap {
+		// no coloring
+		u8Mat.CopyTo(&depthMat)
+
+	} else {
+		// apply colormap
+		gocv.ApplyColorMap(u8Mat, &depthMat, m.Params.Colormap)
+	}
+
+	return nil
+}
+
+// depthToU8 converts a float32 depth map into an 8-bit visualization image.
+//
+// MiDaS outputs "relative depth" values that are not bounded to [0,1] and
+// can vary per image. To visualize, we normalize the depth values to [0,255]
+// using the min/max over the whole output map.  When Params.Invert is set the
+// normalized value n is replaced by 1-n.
+//
+// Non-finite values (NaN/Inf) are ignored when finding min/max and are always
+// rendered black (0), whether or not Params.Invert is set.  If there is no
+// usable range (every value invalid, a constant map, or a range too large
+// to represent as float32) an all black image is returned.
+//
+// depth is read as a single channel h*w map, h and w are the map height and
+// width in pixels.  A non-positive h or w yields an empty slice.
+//
+// Output layout is row-major grayscale: out[y*w + x]
+func (m *MiDaS) depthToU8(depth []float32, h, w int) []byte {
+
+	// no pixels to produce; also avoids make() panicking on a negative length
+	if h <= 0 || w <= 0 {
+		return []byte{}
+	}
+
+	total := h * w
+	out := make([]byte, total)
+
+	// First pass: find min/max depth ignoring NaN/Inf values
+	minV := float32(math.Inf(1))
+	maxV := float32(math.Inf(-1))
+
+	for y := 0; y < h; y++ {
+		for x := 0; x < w; x++ {
+			// Read the depth value at (y,x) from the model output buffer
+			v := m.getDepthAt(depth, y, x, h, w)
+
+			// Skip invalid floating-point values so they don't poison min/max
+			if !m.isFinite32(v) {
+				continue
+			}
+
+			if v < minV {
+				minV = v
+			}
+
+			if v > maxV {
+				maxV = v
+			}
+		}
+	}
+
+	// Guard against all-invalid outputs, a constant output (max==min) and a
+	// range that overflows float32 (den would be +Inf and the division below
+	// would produce NaN)
+	den := maxV - minV
+	if !m.isFinite32(minV) || !m.isFinite32(maxV) || !m.isFinite32(den) || den <= 0 {
+		// Fallback: return all zeros (black image)
+		return out
+	}
+
+	// Second pass: normalize each pixel to [0,1], optionally invert, clamp, then scale to [0,255]
+	for y := 0; y < h; y++ {
+		for x := 0; x < w; x++ {
+			v := m.getDepthAt(depth, y, x, h, w)
+
+			// Invalid pixels stay black.  out is zero initialized, so skipping
+			// the pixel keeps it at 0 even when inversion is enabled
+			if !m.isFinite32(v) {
+				continue
+			}
+
+			// Normalize to 0..1 based on the image's min/max range
+			n := (v - minV) / den
+
+			// Optional inversion for visualization (swap near/far appearance)
+			if m.Params.Invert {
+				n = 1.0 - n
+			}
+
+			// Clamp to [0,1] to avoid overflow/underflow due to outliers or rounding
+			if n < 0 {
+				n = 0
+			}
+			if n > 1 {
+				n = 1
+			}
+
+			// Convert to uint8 grayscale
+			out[y*w+x] = byte(n * 255.0)
+		}
+	}
+
+	return out
+}
+
+// getDepthAt looks up the depth value for pixel (y,x) in the raw output buffer.
+// The buffer is treated as NCHW with only batch 0 / channel 0 being read, so
+// the flat index reduces to y*w + x and h does not affect the result.
+//
+// Coordinates that fall outside the buffer yield 0 instead of panicking; this
+// is not expected to happen when h/w match the tensor dimensions.
+func (m *MiDaS) getDepthAt(buf []float32, y, x, h, w int) float32 {
+
+	// general form: ((n*C + ch)*H + y)*W + x with n=0, ch=0
+	pos := y*w + x
+
+	if pos < 0 || pos >= len(buf) {
+		return 0
+	}
+
+	return buf[pos]
+}
+
+// isFinite32 reports whether v is an ordinary number, that is neither NaN
+// nor positive/negative infinity.
+func (m *MiDaS) isFinite32(v float32) bool {
+	wide := float64(v)
+
+	if math.IsNaN(wide) {
+		return false
+	}
+
+	return !math.IsInf(wide, 0)
+}