added code for PPOCR recognition

2025-09-26 19:31:12 +08:00 · 2024-05-16 21:39:10 +12:00
parent 62986d0404
commit eac2fd22dd
10 changed files with 479 additions and 14 deletions
--- a/example/ppocr-rec/README.md
+++ b/example/ppocr-rec/README.md
@@ -0,0 +1,55 @@
+# PaddleOCR (PPOCR)
+
+[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) provides multilingual
+OCR based on the PaddlePaddle lightweight OCR system, supporting recognition of
+80+ languages.
+
+## Usage
+
+Make sure you have first downloaded the data files used by the examples.
+You only need to do this once for all examples.
+
+```
+cd example/
+git clone https://github.com/swdee/go-rknnlite-data.git data
+```
+
+Run the PPOCR Recognition example.
+```
+cd example/ppocr-rec
+go run ppocr-rec.go
+```
+
+This produces the following output:
+```
+Driver Version: 0.8.2, API Version: 1.6.0 (9a7b5d24c@2023-12-13T17:31:11)
+Model Input Number: 1, Ouput Number: 1
+Input tensors:
+  index=0, name=x, n_dims=4, dims=[1, 48, 320, 3], n_elems=46080, size=92160, fmt=NHWC, type=FP16, qnt_type=AFFINE, zp=0, scale=1.000000
+Output tensors:
+  index=0, name=softmax_11.tmp_0, n_dims=3, dims=[1, 40, 6625, 0], n_elems=265000, size=530000, fmt=UNDEFINED, type=FP16, qnt_type=AFFINE, zp=0, scale=1.000000
+Model first run speed: inference=24.707428ms, post processing=478.906µs, total time=25.186334ms
+Recognize result: JOINT, score=0.71
+Benchmark time=321.330438ms, count=10, average total time=32.133043ms
+done
+```
+
+Sample input images and the text recognised in each.
+
+
+| Input Image                       | Text Recognised | Confidence Score |
+|-----------------------------------|-----------------|------------------|
+| ![joint.png](joint.png)           | JOINT           | 0.71             |
+| ![region.jpg](region.jpg)         |    浙G·Z6825        | 0.65         |
+| ![cn-text.png](cn-text.png)       |    中华老字号        | 0.71          |
+| ![mozzarella.jpg](mozzarella.jpg) |    MOZZARELLA - 188        | 0.67  |
+
+
+
+
+
+## Background
+
+This PPOCR example is a Go conversion of the [C API example](https://github.com/airockchip/rknn_model_zoo/blob/main/examples/PPOCR/PPOCR-Rec/cpp/main.cc).
+
+
--- a/example/ppocr-rec/cn-text.png
+++ b/example/ppocr-rec/cn-text.png
--- a/example/ppocr-rec/joint.png
+++ b/example/ppocr-rec/joint.png
--- a/example/ppocr-rec/mozzarella.jpg
+++ b/example/ppocr-rec/mozzarella.jpg
--- a/example/ppocr-rec/ppocr-rec.go
+++ b/example/ppocr-rec/ppocr-rec.go
@@ -0,0 +1,255 @@
+/*
+Example code showing how to perform OCR on an image using PaddleOCR recognition
+*/
+package main
+
+import (
+	"flag"
+	"fmt"
+	"github.com/swdee/go-rknnlite"
+	"github.com/swdee/go-rknnlite/postprocess"
+	"gocv.io/x/gocv"
+	"image"
+	"log"
+	"time"
+)
+
+// main runs the PPOCR text recognition example.  It loads the RKNN model and
+// the OCR character keys, letterboxes and normalises the input image, runs a
+// single timed inference, logs the recognised text and finally runs a
+// benchmark of repeated inferences.
+func main() {
+	// disable logging timestamps
+	log.SetFlags(0)
+
+	// read in cli flags
+	modelFile := flag.String("m", "../data/ppocrv4_rec-rk3588.rknn", "RKNN compiled model file")
+	imgFile := flag.String("i", "../data/ppocr-rec-test.png", "Image file to run inference on")
+	keysFile := flag.String("k", "../data/ppocr_keys_v1.txt", "Text file containing OCR character keys")
+	flag.Parse()
+
+	// create rknn runtime instance
+	rt, err := rknnlite.NewRuntime(*modelFile, rknnlite.NPUCoreAuto)
+
+	if err != nil {
+		log.Fatal("Error initializing RKNN runtime: ", err)
+	}
+
+	// set runtime to pass input gocv.Mat's to Inference() function as float32
+	// to RKNN backend
+	rt.SetInputTypeFloat32(true)
+
+	// querying of model file tensors and SDK version.  the printed output is
+	// optional, but the returned tensor attributes are used below to size the
+	// input image and the post processor
+	inputAttrs, outputAttrs := optionalQueries(rt)
+
+	// the first input and output tensor are indexed below, so fail with a
+	// clear message instead of an index out of range panic
+	if len(inputAttrs) == 0 || len(outputAttrs) == 0 {
+		log.Fatal("Model must have at least one input and one output tensor")
+	}
+
+	// input tensor is NHWC, so Dims[1] is the height and Dims[2] the width
+	modelHeight := int(inputAttrs[0].Dims[1])
+	modelWidth := int(inputAttrs[0].Dims[2])
+
+	// load in Model character labels
+	modelChars, err := rknnlite.LoadLabels(*keysFile)
+
+	if err != nil {
+		log.Fatal("Error loading model OCR character keys: ", err)
+	}
+
+	// check that we have as many modelChars as tensor outputs dimension
+	if len(modelChars) != int(outputAttrs[0].Dims[2]) {
+		log.Fatalf("OCR character keys text input has %d characters and does "+
+			"not match the required number in the Model of %d",
+			len(modelChars), outputAttrs[0].Dims[2])
+	}
+
+	// create PPOCR post processor
+	ppocrProcessor := postprocess.NewPPOCR(postprocess.PPOCRParams{
+		ModelChars:   modelChars,
+		OutputSeqLen: modelWidth / 8, // modelWidth (320/8)
+	})
+
+	// load image
+	img := gocv.IMRead(*imgFile, gocv.IMReadColor)
+	defer img.Close()
+
+	if img.Empty() {
+		log.Fatal("Error reading image from: ", *imgFile)
+	}
+
+	// resize image to the model input size (320x48 for the default model) and
+	// keep aspect ratio.  the scaled image is left aligned and vertically
+	// centered on a black background
+	resizedImg := gocv.NewMat()
+	resizeKeepAspectRatio(img, &resizedImg, modelWidth, modelHeight)
+	defer resizedImg.Close()
+
+	// convert image to float32 in 3 channels
+	resizedImg.ConvertTo(&resizedImg, gocv.MatTypeCV32FC3)
+
+	// normalize the image (img - 127.5) / 127.5
+	resizedImg.AddFloat(-127.5)
+	resizedImg.DivideFloat(127.5)
+
+	start := time.Now()
+
+	// perform inference on image file
+	outputs, err := rt.Inference([]gocv.Mat{resizedImg})
+
+	if err != nil {
+		log.Fatal("Runtime inferencing failed with error: ", err)
+	}
+
+	endInference := time.Now()
+
+	results := ppocrProcessor.Recognise(outputs)
+
+	endRecognise := time.Now()
+
+	log.Printf("Model first run speed: inference=%s, post processing=%s, total time=%s\n",
+		endInference.Sub(start).String(),
+		endRecognise.Sub(endInference).String(),
+		endRecognise.Sub(start).String(),
+	)
+
+	for _, result := range results {
+		log.Printf("Recognize result: %s, score=%.2f", result.Text, result.Score)
+	}
+
+	// free outputs allocated in C memory after you have finished post processing
+	err = outputs.Free()
+
+	if err != nil {
+		log.Fatal("Error freeing Outputs: ", err)
+	}
+
+	// optional code.  run benchmark to get the average time over repeated runs
+	runBenchmark(rt, ppocrProcessor, []gocv.Mat{resizedImg})
+
+	// close runtime and release resources
+	err = rt.Close()
+
+	if err != nil {
+		log.Fatal("Error closing RKNN runtime: ", err)
+	}
+
+	log.Println("done")
+}
+
+// runBenchmark repeatedly runs inference and post processing on the given
+// Mats and logs the total and average time taken per run.  Any inference or
+// output freeing error terminates the program.
+func runBenchmark(rt *rknnlite.Runtime, ppocrProcessor *postprocess.PPOCR,
+	mats []gocv.Mat) {
+
+	// number of inference runs to time
+	const runs = 100
+
+	began := time.Now()
+
+	for remaining := runs; remaining > 0; remaining-- {
+		// perform inference on image file
+		outputs, err := rt.Inference(mats)
+
+		if err != nil {
+			log.Fatal("Runtime inferencing failed with error: ", err)
+		}
+
+		// post process, the recognised text itself is not needed here
+		ppocrProcessor.Recognise(outputs)
+
+		// release the C memory held by this run's outputs
+		if err = outputs.Free(); err != nil {
+			log.Fatal("Error freeing Outputs: ", err)
+		}
+	}
+
+	elapsed := time.Since(began)
+	mean := elapsed / time.Duration(runs)
+
+	log.Printf("Benchmark time=%s, count=%d, average total time=%s\n",
+		elapsed.String(), runs, mean.String(),
+	)
+}
+
+// resizeKeepAspectRatio resizes an image to fit within a desired width and
+// height while maintaining the aspect ratio.  The scaled image is placed on a
+// black background, left aligned horizontally and centered vertically, with
+// black letterboxing filling the remaining area.
+//
+// If dstImg is empty it is replaced with a new height x width 8 bit 3 channel
+// Mat.  A non-empty dstImg is reused as is, so it must already be at least
+// width x height in size.  If srcImg has no rows or columns then dstImg is
+// left completely black.
+func resizeKeepAspectRatio(srcImg gocv.Mat, dstImg *gocv.Mat, width, height int) {
+
+	// ensure destination Mat is the correct size and type.  the empty Mat is
+	// closed before being replaced so its C allocation is not leaked
+	if dstImg.Empty() {
+		dstImg.Close()
+		*dstImg = gocv.NewMatWithSize(height, width, gocv.MatTypeCV8UC3)
+	}
+
+	// create a black image
+	dstImg.SetTo(gocv.NewScalar(0, 0, 0, 0))
+
+	srcWidth := srcImg.Cols()
+	srcHeight := srcImg.Rows()
+
+	// nothing to scale, this also avoids dividing by zero below
+	if srcWidth <= 0 || srcHeight <= 0 {
+		return
+	}
+
+	// calculate the ratio of the original image
+	srcRatio := float64(srcWidth) / float64(srcHeight)
+
+	// calculate the ratio of the new dimensions
+	dstRatio := float64(width) / float64(height)
+
+	newWidth, newHeight := width, height
+
+	// adjust dimensions to maintain aspect ratio
+	if srcRatio > dstRatio {
+		newHeight = int(float64(width) / srcRatio)
+	} else {
+		newWidth = int(float64(height) * srcRatio)
+	}
+
+	// integer truncation of a very wide or very tall source can produce a
+	// zero dimension, which is not a valid resize target
+	if newWidth < 1 {
+		newWidth = 1
+	}
+
+	if newHeight < 1 {
+		newHeight = 1
+	}
+
+	// resize the original image to the new size that fits within the desired dimensions
+	resizedImg := gocv.NewMat()
+	defer resizedImg.Close()
+	gocv.Resize(srcImg, &resizedImg, image.Pt(newWidth, newHeight), 0, 0, gocv.InterpolationLinear)
+
+	// top-left corner of the resized image within the destination.  the image
+	// is deliberately kept left aligned (x = 0) and only centered vertically
+	x := 0
+	y := (height - newHeight) / 2
+
+	// define a region of interest (ROI) within the final image where the
+	// resized image will be placed
+	roi := dstImg.Region(image.Rect(x, y, x+newWidth, y+newHeight))
+	resizedImg.CopyTo(&roi)
+	roi.Close()
+}
+
+// optionalQueries prints the SDK version, the number of model inputs and
+// outputs, and the attributes of every input and output tensor.  It returns
+// the queried input and output tensor attributes.  Any query error terminates
+// the program.
+func optionalQueries(rt *rknnlite.Runtime) ([]rknnlite.TensorAttr, []rknnlite.TensorAttr) {
+
+	// get SDK version
+	ver, err := rt.SDKVersion()
+
+	if err != nil {
+		log.Fatal("Error querying SDK version: ", err)
+	}
+
+	fmt.Printf("Driver Version: %s, API Version: %s\n", ver.DriverVersion, ver.APIVersion)
+
+	// get model input and output numbers
+	num, err := rt.QueryModelIONumber()
+
+	if err != nil {
+		log.Fatal("Error querying IO Numbers: ", err)
+	}
+
+	log.Printf("Model Input Number: %d, Ouput Number: %d\n", num.NumberInput, num.NumberOutput)
+
+	// query Input tensors
+	inputAttrs, err := rt.QueryInputTensors()
+
+	if err != nil {
+		log.Fatal("Error querying Input Tensors: ", err)
+	}
+
+	log.Println("Input tensors:")
+
+	for _, attr := range inputAttrs {
+		log.Printf("  %s\n", attr.String())
+	}
+
+	// query Output tensors
+	outputAttrs, err := rt.QueryOutputTensors()
+
+	if err != nil {
+		log.Fatal("Error querying Output Tensors: ", err)
+	}
+
+	log.Println("Output tensors:")
+
+	for _, attr := range outputAttrs {
+		log.Printf("  %s\n", attr.String())
+	}
+
+	return inputAttrs, outputAttrs
+}
--- a/example/ppocr-rec/region.jpg
+++ b/example/ppocr-rec/region.jpg
--- a/inference.go
+++ b/inference.go
@@ -3,6 +3,7 @@ package rknnlite
 /*
 #include "rknn_api.h"
 #include <stdlib.h>
+#include <string.h>
 */
 import "C"
 import (
@@ -47,22 +48,40 @@ func (r *Runtime) Inference(mats []gocv.Mat) (*Outputs, error) {
 			mat = mat.Clone()
 		}

-		// cast to float32, as PassThrough below is set to false then RKNN
-		// will convert the input values to that of the tensor inputs in the model,
-		// eg: INT8
-		data, err := mat.DataPtrUint8()
+		if r.inputTypeFloat32 {
+			// pass data as float32 to RKNN backend
+			data, err := mat.DataPtrFloat32()

-		if err != nil {
-			return &Outputs{}, fmt.Errorf("error converting image to float32: %w", err)
-		}
+			if err != nil {
+				return &Outputs{}, fmt.Errorf("error getting data pointer to Mat: %w", err)
+			}

-		inputs[idx] = Input{
-			Index:       uint32(idx),
-			Type:        TensorUint8,
-			Size:        uint32(mat.Cols() * mat.Rows() * mat.Channels()),
-			Fmt:         TensorNHWC,
-			Buf:         unsafe.Pointer(&data[0]),
-			PassThrough: false,
+			inputs[idx] = Input{
+				Index: uint32(idx),
+				Type:  TensorFloat32,
+				// multiply by 4 for size of float32
+				Size:        uint32(mat.Cols() * mat.Rows() * mat.Channels() * 4),
+				Fmt:         TensorNHWC,
+				Buf:         unsafe.Pointer(&data[0]),
+				PassThrough: false,
+			}
+
+		} else {
+			// pass data as uint8 to RKNN backend
+			data, err := mat.DataPtrUint8()
+
+			if err != nil {
+				return &Outputs{}, fmt.Errorf("error getting data pointer to Mat: %w", err)
+			}
+
+			inputs[idx] = Input{
+				Index:       uint32(idx),
+				Type:        TensorUint8,
+				Size:        uint32(mat.Cols() * mat.Rows() * mat.Channels()),
+				Fmt:         TensorNHWC,
+				Buf:         unsafe.Pointer(&data[0]),
+				PassThrough: false,
+			}
 		}
 	}

--- a/labels.go
+++ b/labels.go
@@ -28,6 +28,12 @@ func LoadLabels(file string) ([]string, error) {
 	// read and trim each line
 	for scanner.Scan() {
 		line := strings.TrimSpace(scanner.Text())
+
+		// handle special keyword to convert to " " this is needed for
+		// PPOCR key list
+		if line == "__space__" {
+			line = " "
+		}
 		labels = append(labels, line)
 	}

--- a/postprocess/ppocr.go
+++ b/postprocess/ppocr.go
@@ -0,0 +1,120 @@
+package postprocess
+
+import (
+	"fmt"
+	"github.com/swdee/go-rknnlite"
+	"math"
+)
+
+// PPOCR defines the struct for the PPOCR model inference post processing.
+// Create instances with NewPPOCR so that the unexported fields of Params are
+// initialised.
+type PPOCR struct {
+	// Params holds the parameters used for post processing
+	Params PPOCRParams
+}
+
+// PPOCRParams defines the struct containing the PPOCR parameters to use for
+// post processing operations
+type PPOCRParams struct {
+	// ModelChars is the list of characters used to train the PPOCR model.  It
+	// is indexed by the position of the highest value found at each sequence
+	// step of the model output
+	ModelChars []string
+	// numChar is the number of characters in ModelChars, it is set by NewPPOCR
+	numChar int
+	// OutputSeqLen is the length of sequence output data from the OCR model,
+	// which is the number of sequence steps decoded from each output
+	OutputSeqLen int
+}
+
+// RecogniseResult is a text result recognised by OCR
+type RecogniseResult struct {
+	// Text is the recognised text
+	Text string
+	// Score is the confidence score of the text recognised.  It is the average
+	// of the highest output value at each sequence step that contributed a
+	// character to Text, or 0 when no characters were recognised
+	Score float32
+}
+
+// NewPPOCR creates a PPOCR post processor from the given parameters and
+// records the number of characters held in param.ModelChars
+func NewPPOCR(param PPOCRParams) *PPOCR {
+	// param is a copy of the caller's struct, so it can be completed in place
+	param.numChar = len(param.ModelChars)
+
+	return &PPOCR{Params: param}
+}
+
+// Recognise converts each of the RKNN outputs to text and returns one result
+// per output, in the same order.  An output that fails to be converted is
+// reported as a result with the text "ERROR ModelChars" and a score of zero.
+func (p *PPOCR) Recognise(outputs *rknnlite.Outputs) []RecogniseResult {
+
+	recognised := make([]RecogniseResult, 0, len(outputs.Output))
+
+	for i := range outputs.Output {
+		rec, err := p.recogniseText(outputs.Output[i])
+
+		if err != nil {
+			// substitute a placeholder so results stay aligned with outputs
+			rec = RecogniseResult{
+				Text:  "ERROR ModelChars",
+				Score: 0,
+			}
+		}
+
+		recognised = append(recognised, rec)
+	}
+
+	return recognised
+}
+
+// recogniseText takes a single RKNN Output and returns the OCR'd text and its
+// confidence score.
+//
+// The output buffer is read as OutputSeqLen consecutive rows of numChar
+// values.  For each row the index of the highest value selects a character
+// from ModelChars.  Index 0 is skipped and consecutive repeats of the same
+// index are collapsed into a single character, in the manner of greedy CTC
+// decoding.  The score is the average of the highest value of every row that
+// contributed a character.
+//
+// An error is returned if the output buffer is too small to hold all rows or
+// if a selected index has no entry in ModelChars.
+func (p *PPOCR) recogniseText(output rknnlite.Output) (RecogniseResult, error) {
+
+	numChar := p.Params.numChar
+	seqLen := p.Params.OutputSeqLen
+
+	// make sure the buffer holds a full row for every sequence step, as
+	// slicing past its end below would panic
+	if seqLen > 0 && seqLen*numChar > len(output.BufFloat) {
+		return RecogniseResult{}, fmt.Errorf("output buffer has %d values but %d are required",
+			len(output.BufFloat), seqLen*numChar)
+	}
+
+	var (
+		text     []byte
+		scoreSum float32
+		count    int
+		lastIdx  int
+	)
+
+	for n := 0; n < seqLen; n++ {
+
+		offset := n * numChar
+		argmaxIdx, argmaxVal := p.argMax(output.BufFloat[offset : offset+numChar])
+
+		// skip index 0 and any repeat of the previous step's index
+		if argmaxIdx > 0 && !(n > 0 && argmaxIdx == lastIdx) {
+
+			// validate before touching the score or indexing ModelChars
+			if argmaxIdx >= len(p.Params.ModelChars) {
+				return RecogniseResult{}, fmt.Errorf("output index %d is out of range of ModelChars list of size %d",
+					argmaxIdx, len(p.Params.ModelChars))
+			}
+
+			// add to score max value
+			scoreSum += argmaxVal
+			count++
+
+			// appending to a byte slice avoids reallocating the whole string
+			// for every character
+			text = append(text, p.Params.ModelChars[argmaxIdx]...)
+		}
+
+		lastIdx = argmaxIdx
+	}
+
+	res := RecogniseResult{
+		Text: string(text),
+		// the small constant keeps the division defined when count is zero
+		Score: scoreSum / (float32(count) + 1e-6),
+	}
+
+	if count == 0 || math.IsNaN(float64(res.Score)) {
+		res.Score = 0.0
+	}
+
+	return res, nil
+}
+
+// argMax finds the largest element in a slice and returns its index together
+// with its value.  When the largest value occurs more than once the first
+// occurrence is returned.  An empty slice yields index 0 and value 0.
+func (p *PPOCR) argMax(slice []float32) (int, float32) {
+
+	if len(slice) == 0 {
+		return 0, 0
+	}
+
+	bestIdx, bestVal := 0, slice[0]
+
+	// the first element is the initial best, so start comparing from the second
+	for i := 1; i < len(slice); i++ {
+		if slice[i] > bestVal {
+			bestIdx, bestVal = i, slice[i]
+		}
+	}
+
+	return bestIdx, bestVal
+}
--- a/runtime.go
+++ b/runtime.go
@@ -99,6 +99,9 @@ type Runtime struct {
 	// wantFloat indicates if Outputs are converted to float32 or left as int8.
 	// default option is True
 	wantFloat bool
+	// inputTypeFloat32 indicates if we pass the input gocv.Mat's data as float32
+	// to the RKNN backend
+	inputTypeFloat32 bool
 }

 // NewRuntime returns a RKNN run time instance.  Provide the full path and
@@ -211,6 +214,13 @@ func (r *Runtime) SetWantFloat(val bool) {
 	r.wantFloat = val
 }

+// SetInputTypeFloat32 defines if the Model requires the Inference() function to
+// pass the gocv.Mat's as float32 data to RKNN backend.  Setting this to true
+// overrides the default behaviour of passing gocv.Mat data as Uint8, setting
+// it to false restores that default.
+// NOTE(review): when enabled the Mat's given to Inference() presumably need to
+// already hold float32 data (eg: gocv.MatTypeCV32FC3) - confirm against
+// Inference().
+func (r *Runtime) SetInputTypeFloat32(val bool) {
+	r.inputTypeFloat32 = val
+}
+
 // SDKVersion represents the C.rknn_sdk_version struct
 type SDKVersion struct {
 	DriverVersion string