added code for PPOCR recognition

This commit is contained in:
swdee
2024-05-16 21:39:10 +12:00
parent 62986d0404
commit eac2fd22dd
10 changed files with 479 additions and 14 deletions

View File

@@ -0,0 +1,55 @@
# PaddleOCR (PPOCR)
[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) provides multilingual
OCR based on the PaddlePaddle lightweight OCR system, supporting recognition of
80+ languages.
## Usage
Make sure you have downloaded the data files first for the examples.
You only need to do this once for all examples.
```
cd example/
git clone https://github.com/swdee/go-rknnlite-data.git data
```
Run the PPOCR Recognition example.
```
cd example/ppocr-rec
go run ppocr-rec.go
```
This will result in the output of:
```
Driver Version: 0.8.2, API Version: 1.6.0 (9a7b5d24c@2023-12-13T17:31:11)
Model Input Number: 1, Ouput Number: 1
Input tensors:
index=0, name=x, n_dims=4, dims=[1, 48, 320, 3], n_elems=46080, size=92160, fmt=NHWC, type=FP16, qnt_type=AFFINE, zp=0, scale=1.000000
Output tensors:
index=0, name=softmax_11.tmp_0, n_dims=3, dims=[1, 40, 6625, 0], n_elems=265000, size=530000, fmt=UNDEFINED, type=FP16, qnt_type=AFFINE, zp=0, scale=1.000000
Model first run speed: inference=24.707428ms, post processing=478.906µs, total time=25.186334ms
Recognize result: JOINT, score=0.71
Benchmark time=321.330438ms, count=10, average total time=32.133043ms
done
```
Sample input images and the text recognised in each.
| Input Image | Text Recognised | Confidence Score |
|-----------------------------------|-----------------|------------------|
| ![joint.png](joint.png) | JOINT | 0.71 |
| ![region.jpg](region.jpg) | 浙G·Z6825 | 0.65 |
| ![cn-text.png](cn-text.png) | 中华老字号 | 0.71 |
| ![mozzarella.jpg](mozzarella.jpg) | MOZZARELLA - 188 | 0.67 |
## Background
This PPOCR example is a Go conversion of the [C API example](https://github.com/airockchip/rknn_model_zoo/blob/main/examples/PPOCR/PPOCR-Rec/cpp/main.cc).

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

BIN
example/ppocr-rec/joint.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

View File

@@ -0,0 +1,255 @@
/*
Example code showing how to perform OCR on an image using PaddleOCR recognition
*/
package main
import (
"flag"
"fmt"
"github.com/swdee/go-rknnlite"
"github.com/swdee/go-rknnlite/postprocess"
"gocv.io/x/gocv"
"image"
"log"
"time"
)
// main loads a PPOCR recognition model, runs it once against a single image,
// logs the recognised text and timings, then runs a short benchmark.
func main() {
	// drop the timestamp prefix from log output
	log.SetFlags(0)

	// command line options
	modelFile := flag.String("m", "../data/ppocrv4_rec-rk3588.rknn", "RKNN compiled model file")
	imgFile := flag.String("i", "../data/ppocr-rec-test.png", "Image file to run inference on")
	keysFile := flag.String("k", "../data/ppocr_keys_v1.txt", "Text file containing OCR character keys")
	flag.Parse()

	// start the RKNN runtime for the chosen model
	rt, err := rknnlite.NewRuntime(*modelFile, rknnlite.NPUCoreAuto)
	if err != nil {
		log.Fatal("Error initializing RKNN runtime: ", err)
	}

	// have Inference() hand the gocv.Mat data to the RKNN backend as float32
	rt.SetInputTypeFloat32(true)

	// print SDK and tensor details. only the returned tensor attributes are
	// needed below, the printing itself is optional for production code
	inputAttrs, outputAttrs := optionalQueries(rt)

	// read the character keys the model was trained with
	modelChars, err := rknnlite.LoadLabels(*keysFile)
	if err != nil {
		log.Fatal("Error loading model OCR character keys: ", err)
	}

	// the key list has to line up with the size of the output tensor dimension
	if numKeys := len(modelChars); numKeys != int(outputAttrs[0].Dims[2]) {
		log.Fatalf("OCR character keys text input has %d characters and does "+
			"not match the required number in the Model of %d",
			numKeys, outputAttrs[0].Dims[2])
	}

	// model input geometry as reported by the first input tensor
	modelWidth := int(inputAttrs[0].Dims[2])
	modelHeight := int(inputAttrs[0].Dims[1])

	// build the post processor that turns model output into text
	ppocrProcessor := postprocess.NewPPOCR(postprocess.PPOCRParams{
		ModelChars:   modelChars,
		OutputSeqLen: modelWidth / 8, // modelWidth (320/8)
	})

	// read the source image from disk
	img := gocv.IMRead(*imgFile, gocv.IMReadColor)
	if img.Empty() {
		log.Fatal("Error reading image from: ", *imgFile)
	}

	// scale the image to the model input size while keeping its aspect ratio,
	// any unused area is left black
	resizedImg := gocv.NewMat()
	resizeKeepAspectRatio(img, &resizedImg, modelWidth, modelHeight)

	// switch to 3 channel float32 pixels and normalize: (img - 127.5) / 127.5
	resizedImg.ConvertTo(&resizedImg, gocv.MatTypeCV32FC3)
	resizedImg.AddFloat(-127.5)
	resizedImg.DivideFloat(127.5)

	defer img.Close()
	defer resizedImg.Close()

	// timed section: inference followed by post processing
	start := time.Now()

	outputs, err := rt.Inference([]gocv.Mat{resizedImg})
	if err != nil {
		log.Fatal("Runtime inferencing failed with error: ", err)
	}

	endInference := time.Now()

	results := ppocrProcessor.Recognise(outputs)

	endRecognise := time.Now()

	inferenceTime := endInference.Sub(start)
	postProcessTime := endRecognise.Sub(endInference)
	totalTime := endRecognise.Sub(start)

	log.Printf("Model first run speed: inference=%s, post processing=%s, total time=%s\n",
		inferenceTime, postProcessTime, totalTime)

	for i := range results {
		log.Printf("Recognize result: %s, score=%.2f", results[i].Text, results[i].Score)
	}

	// outputs live in C memory, release them once post processing is finished
	if err = outputs.Free(); err != nil {
		log.Fatal("Error freeing Outputs: ", err)
	}

	// optional code. benchmark repeated runs on the same prepared image
	runBenchmark(rt, ppocrProcessor, []gocv.Mat{resizedImg})

	// shut the runtime down and release its resources
	if err = rt.Close(); err != nil {
		log.Fatal("Error closing RKNN runtime: ", err)
	}

	log.Println("done")
}
// runBenchmark repeatedly runs inference and post processing on mats and logs
// the total and average time per run. The run count is 10, matching the
// documented example output. Any inference or free error is fatal.
func runBenchmark(rt *rknnlite.Runtime, ppocrProcessor *postprocess.PPOCR,
	mats []gocv.Mat) {

	// number of inference + post processing passes to time
	const count = 10

	start := time.Now()

	for i := 0; i < count; i++ {
		// perform inference on image file
		outputs, err := rt.Inference(mats)
		if err != nil {
			log.Fatal("Runtime inferencing failed with error: ", err)
		}

		// post process, the result is discarded as only the timing matters
		_ = ppocrProcessor.Recognise(outputs)

		// release the C memory held by the outputs before the next pass
		if err = outputs.Free(); err != nil {
			log.Fatal("Error freeing Outputs: ", err)
		}
	}

	total := time.Since(start)
	avg := total / time.Duration(count)

	log.Printf("Benchmark time=%s, count=%d, average total time=%s\n",
		total.String(), count, avg.String(),
	)
}
// resizeKeepAspectRatio scales srcImg to fit within width x height while
// maintaining its aspect ratio and writes the result into dstImg. The scaled
// image is placed against the left edge and centered vertically, the rest of
// dstImg is filled with black.
//
// If dstImg is empty it is allocated as a height x width 8 bit 3 channel Mat.
// A non-empty dstImg is used as-is and is assumed to already be large enough
// to hold the scaled image. srcImg is expected to be non-empty.
func resizeKeepAspectRatio(srcImg gocv.Mat, dstImg *gocv.Mat, width, height int) {
	// calculate the ratio of the original image
	srcWidth := srcImg.Cols()
	srcHeight := srcImg.Rows()
	srcRatio := float64(srcWidth) / float64(srcHeight)

	// calculate the ratio of the new dimensions
	dstRatio := float64(width) / float64(height)

	newWidth, newHeight := width, height

	// adjust dimensions to maintain aspect ratio
	if srcRatio > dstRatio {
		newHeight = int(float64(width) / srcRatio)
	} else {
		newWidth = int(float64(height) * srcRatio)
	}

	// very wide or very tall sources can truncate to a zero sized dimension,
	// which is not a valid resize target, so keep at least one pixel
	if newWidth < 1 {
		newWidth = 1
	}
	if newHeight < 1 {
		newHeight = 1
	}

	// resize the original image to the new size that fits within the desired dimensions
	scaled := gocv.NewMat()
	gocv.Resize(srcImg, &scaled, image.Pt(newWidth, newHeight), 0, 0, gocv.InterpolationLinear)
	defer scaled.Close()

	// ensure destination Mat is the correct size and type
	if dstImg.Empty() {
		*dstImg = gocv.NewMatWithSize(height, width, gocv.MatTypeCV8UC3)
	}

	// create a black image
	dstImg.SetTo(gocv.NewScalar(0, 0, 0, 0))

	// top-left corner of the scaled image: flush with the left edge and
	// centered vertically
	x := 0
	y := (height - newHeight) / 2

	// define a region of interest (ROI) within the final image where the
	// resized image will be placed
	roi := dstImg.Region(image.Rect(x, y, x+newWidth, y+newHeight))
	defer roi.Close()

	scaled.CopyTo(&roi)
}
// optionalQueries prints the SDK version, the model's input/output counts and
// the attributes of every input and output tensor. It returns the input and
// output tensor attributes so the caller can size its pre and post processing.
// Any query failure is fatal.
func optionalQueries(rt *rknnlite.Runtime) ([]rknnlite.TensorAttr, []rknnlite.TensorAttr) {
	// get SDK version
	ver, err := rt.SDKVersion()
	if err != nil {
		log.Fatal("Error querying SDK version: ", err)
	}

	// written with fmt rather than the logger, as in the original code
	fmt.Printf("Driver Version: %s, API Version: %s\n", ver.DriverVersion, ver.APIVersion)

	// get model input and output numbers
	num, err := rt.QueryModelIONumber()
	if err != nil {
		log.Fatal("Error querying IO Numbers: ", err)
	}

	log.Printf("Model Input Number: %d, Ouput Number: %d\n", num.NumberInput, num.NumberOutput)

	// query Input tensors
	inputAttrs, err := rt.QueryInputTensors()
	if err != nil {
		log.Fatal("Error querying Input Tensors: ", err)
	}

	log.Println("Input tensors:")

	for _, attr := range inputAttrs {
		log.Printf(" %s\n", attr.String())
	}

	// query Output tensors
	outputAttrs, err := rt.QueryOutputTensors()
	if err != nil {
		log.Fatal("Error querying Output Tensors: ", err)
	}

	log.Println("Output tensors:")

	for _, attr := range outputAttrs {
		log.Printf(" %s\n", attr.String())
	}

	return inputAttrs, outputAttrs
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -3,6 +3,7 @@ package rknnlite
/*
#include "rknn_api.h"
#include <stdlib.h>
#include <string.h>
*/
import "C"
import (
@@ -47,22 +48,40 @@ func (r *Runtime) Inference(mats []gocv.Mat) (*Outputs, error) {
mat = mat.Clone()
}
// cast to float32, as PassThrough below is set to false then RKNN
// will convert the input values to that of the tensor inputs in the model,
// eg: INT8
data, err := mat.DataPtrUint8()
if r.inputTypeFloat32 {
// pass data as float32 to RKNN backend
data, err := mat.DataPtrFloat32()
if err != nil {
return &Outputs{}, fmt.Errorf("error converting image to float32: %w", err)
}
if err != nil {
return &Outputs{}, fmt.Errorf("error getting data pointer to Mat: %w", err)
}
inputs[idx] = Input{
Index: uint32(idx),
Type: TensorUint8,
Size: uint32(mat.Cols() * mat.Rows() * mat.Channels()),
Fmt: TensorNHWC,
Buf: unsafe.Pointer(&data[0]),
PassThrough: false,
inputs[idx] = Input{
Index: uint32(idx),
Type: TensorFloat32,
// multiply by 4 for size of float32
Size: uint32(mat.Cols() * mat.Rows() * mat.Channels() * 4),
Fmt: TensorNHWC,
Buf: unsafe.Pointer(&data[0]),
PassThrough: false,
}
} else {
// pass data as uint8 to RKNN backend
data, err := mat.DataPtrUint8()
if err != nil {
return &Outputs{}, fmt.Errorf("error getting data pointer to Mat: %w", err)
}
inputs[idx] = Input{
Index: uint32(idx),
Type: TensorUint8,
Size: uint32(mat.Cols() * mat.Rows() * mat.Channels()),
Fmt: TensorNHWC,
Buf: unsafe.Pointer(&data[0]),
PassThrough: false,
}
}
}

View File

@@ -28,6 +28,12 @@ func LoadLabels(file string) ([]string, error) {
// read and trim each line
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// handle special keyword to convert to " " this is needed for
// PPOCR key list
if line == "__space__" {
line = " "
}
labels = append(labels, line)
}

120
postprocess/ppocr.go Normal file
View File

@@ -0,0 +1,120 @@
package postprocess
import (
"fmt"
"github.com/swdee/go-rknnlite"
"math"
)
// PPOCR defines the struct for the PPOCR model inference post processing.
// Create instances with NewPPOCR so the derived character count in Params is
// filled in.
type PPOCR struct {
// Params holds the post processing parameters passed to NewPPOCR
Params PPOCRParams
}
// PPOCRParams defines the struct containing the PPOCR parameters to use for
// post processing operations
type PPOCRParams struct {
// ModelChars is the list of characters used to train the PPOCR model. The
// position of a character in this list is the index the post processor looks
// up for it; index 0 is never emitted as text.
ModelChars []string
// numChar is the number of characters in ModelChars. It is set by NewPPOCR
// and is the number of values read from the output buffer per sequence step.
numChar int
// OutputSeqLen is the length of sequence output data from the OCR model,
// ie: the number of steps decoded from each output buffer
OutputSeqLen int
}
// RecogniseResult is a text result recognised by OCR
type RecogniseResult struct {
// Text is the recognised text
Text string
// Score is the confidence score of the text recognised, computed as the
// average of the maximum output value of each character emitted. It is 0
// when no characters were emitted.
Score float32
}
// NewPPOCR builds a PPOCR post processor from the given parameters. The
// number of model characters is derived from param.ModelChars and stored
// alongside the supplied values.
func NewPPOCR(param PPOCRParams) *PPOCR {
	// param is a copy, so recording the character count here does not touch
	// the caller's struct
	param.numChar = len(param.ModelChars)

	return &PPOCR{Params: param}
}
// Recognise converts every RKNN output into a text result. The returned slice
// has one entry per output, in the same order. An output that fails to decode
// is reported with the placeholder text "ERROR ModelChars" and a score of 0.
func (p *PPOCR) Recognise(outputs *rknnlite.Outputs) []RecogniseResult {

	results := make([]RecogniseResult, 0, len(outputs.Output))

	for _, output := range outputs.Output {
		rec, err := p.recogniseText(output)

		if err != nil {
			// swap the failed decode for a placeholder result
			rec = RecogniseResult{
				Text:  "ERROR ModelChars",
				Score: 0,
			}
		}

		results = append(results, rec)
	}

	return results
}
// recogniseText takes a single RKNN Output and returns the OCR'd text together
// with its confidence score.
//
// The output buffer is read as OutputSeqLen consecutive steps of numChar
// values each. For every step the index of the largest value is taken, index
// 0 and immediate repeats of the previous step's index are skipped, and the
// remaining indexes are looked up in ModelChars to build the text. The score
// is the average of the largest values of the characters emitted, or 0 when
// nothing was emitted.
//
// An error is returned if the output buffer holds fewer values than
// OutputSeqLen*numChar, or if an index falls outside of ModelChars.
func (p *PPOCR) recogniseText(output rknnlite.Output) (RecogniseResult, error) {

	numChar := p.Params.numChar
	seqLen := p.Params.OutputSeqLen

	// slicing past the end of the buffer would panic, so check up front
	if need := seqLen * numChar; need > len(output.BufFloat) {
		return RecogniseResult{}, fmt.Errorf("output buffer holds %d values but %d are needed",
			len(output.BufFloat), need)
	}

	var (
		text    []byte
		score   float32
		count   int
		lastIdx int
	)

	for n := 0; n < seqLen; n++ {

		offset := n * numChar
		argmaxIdx, argmaxVal := p.argMax(output.BufFloat[offset : offset+numChar])

		if argmaxIdx > 0 && !(n > 0 && argmaxIdx == lastIdx) {

			// valid indexes run from 0 to len-1, anything else has no
			// character to look up
			if argmaxIdx >= len(p.Params.ModelChars) {
				return RecogniseResult{}, fmt.Errorf("output index is larger than size of ModelChars list")
			}

			// add to score max value
			score += argmaxVal
			count++

			text = append(text, p.Params.ModelChars[argmaxIdx]...)
		}

		lastIdx = argmaxIdx
	}

	res := RecogniseResult{Text: string(text)}

	if count > 0 {
		// small epsilon kept in the divisor, as in the original calculation
		res.Score = score / (float32(count) + 1e-6)
	}

	// a NaN in the model output must not leak out as the score
	if math.IsNaN(float64(res.Score)) {
		res.Score = 0.0
	}

	return res, nil
}
// argMax scans slice and returns the position and value of its largest
// element. When the largest value occurs more than once the first position
// wins. An empty slice yields 0, 0.
func (p *PPOCR) argMax(slice []float32) (int, float32) {

	var (
		bestIdx int
		bestVal float32
	)

	if len(slice) > 0 {
		bestVal = slice[0]
	}

	// element 0 is already the current best, so start comparing from 1
	for i := 1; i < len(slice); i++ {
		if slice[i] > bestVal {
			bestIdx, bestVal = i, slice[i]
		}
	}

	return bestIdx, bestVal
}

View File

@@ -99,6 +99,9 @@ type Runtime struct {
// wantFloat indicates if Outputs are converted to float32 or left as int8.
// default option is True
wantFloat bool
// inputTypeFloat32 indicates if we pass the input gocv.Mat's data as float32
// to the RKNN backend
inputTypeFloat32 bool
}
// NewRuntime returns a RKNN run time instance. Provide the full path and
@@ -211,6 +214,13 @@ func (r *Runtime) SetWantFloat(val bool) {
r.wantFloat = val
}
// SetInputTypeFloat32 defines if the Model requires the Inference() function to
// pass the gocv.Mat's as float32 data to RKNN backend. Setting this to true
// overrides the default behaviour of passing gocv.Mat data as Uint8. The
// setting is stored on the Runtime.
func (r *Runtime) SetInputTypeFloat32(val bool) {
r.inputTypeFloat32 = val
}
// SDKVersion represents the C.rknn_sdk_version struct
type SDKVersion struct {
DriverVersion string