mirror of
https://github.com/swdee/go-rknnlite.git
synced 2025-09-26 19:31:12 +08:00
added code for PPOCR recognition
This commit is contained in:
55
example/ppocr-rec/README.md
Normal file
55
example/ppocr-rec/README.md
Normal file
@@ -0,0 +1,55 @@
|
||||
# PaddleOCR (PPOCR)
|
||||
|
||||
[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) provides multilingual
|
||||
OCR based on the PaddlePaddle lightweight OCR system, supporting recognition of
|
||||
80+ languages.
|
||||
|
||||
## Usage
|
||||
|
||||
Make sure you have downloaded the data files first for the examples.
|
||||
You only need to do this once for all examples.
|
||||
|
||||
```
|
||||
cd example/
|
||||
git clone https://github.com/swdee/go-rknnlite-data.git data
|
||||
```
|
||||
|
||||
Run the PPOCR Recognition example.
|
||||
```
|
||||
cd example/ppocr-rec
|
||||
go run ppocr-rec.go
|
||||
```
|
||||
|
||||
This will result in the output of:
|
||||
```
|
||||
Driver Version: 0.8.2, API Version: 1.6.0 (9a7b5d24c@2023-12-13T17:31:11)
|
||||
Model Input Number: 1, Ouput Number: 1
|
||||
Input tensors:
|
||||
index=0, name=x, n_dims=4, dims=[1, 48, 320, 3], n_elems=46080, size=92160, fmt=NHWC, type=FP16, qnt_type=AFFINE, zp=0, scale=1.000000
|
||||
Output tensors:
|
||||
index=0, name=softmax_11.tmp_0, n_dims=3, dims=[1, 40, 6625, 0], n_elems=265000, size=530000, fmt=UNDEFINED, type=FP16, qnt_type=AFFINE, zp=0, scale=1.000000
|
||||
Model first run speed: inference=24.707428ms, post processing=478.906µs, total time=25.186334ms
|
||||
Recognize result: JOINT, score=0.71
|
||||
Benchmark time=321.330438ms, count=10, average total time=32.133043ms
|
||||
done
|
||||
```
|
||||
|
||||
Sample input images and the text recognised from each.
|
||||
|
||||
|
||||
| Input Image | Text Recognised | Confidence Score |
|
||||
|-----------------------------------|-----------------|------------------|
|
||||
|  | JOINT | 0.71 |
|
||||
|  | 浙G·Z6825 | 0.65 |
|
||||
|  | 中华老字号 | 0.71 |
|
||||
|  | MOZZARELLA - 188 | 0.67 |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Background
|
||||
|
||||
This PPOCR example is a Go conversion of the [C API example](https://github.com/airockchip/rknn_model_zoo/blob/main/examples/PPOCR/PPOCR-Rec/cpp/main.cc).
|
||||
|
||||
|
BIN
example/ppocr-rec/cn-text.png
Normal file
BIN
example/ppocr-rec/cn-text.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 21 KiB |
BIN
example/ppocr-rec/joint.png
Normal file
BIN
example/ppocr-rec/joint.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 10 KiB |
BIN
example/ppocr-rec/mozzarella.jpg
Normal file
BIN
example/ppocr-rec/mozzarella.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 14 KiB |
255
example/ppocr-rec/ppocr-rec.go
Normal file
255
example/ppocr-rec/ppocr-rec.go
Normal file
@@ -0,0 +1,255 @@
|
||||
/*
|
||||
Example code showing how to perform OCR on an image using PaddleOCR recognition
|
||||
*/
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"github.com/swdee/go-rknnlite"
|
||||
"github.com/swdee/go-rknnlite/postprocess"
|
||||
"gocv.io/x/gocv"
|
||||
"image"
|
||||
"log"
|
||||
"time"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// disable logging timestamps
|
||||
log.SetFlags(0)
|
||||
|
||||
// read in cli flags
|
||||
modelFile := flag.String("m", "../data/ppocrv4_rec-rk3588.rknn", "RKNN compiled model file")
|
||||
imgFile := flag.String("i", "../data/ppocr-rec-test.png", "Image file to run inference on")
|
||||
keysFile := flag.String("k", "../data/ppocr_keys_v1.txt", "Text file containing OCR character keys")
|
||||
flag.Parse()
|
||||
|
||||
// create rknn runtime instance
|
||||
rt, err := rknnlite.NewRuntime(*modelFile, rknnlite.NPUCoreAuto)
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error initializing RKNN runtime: ", err)
|
||||
}
|
||||
|
||||
// set runtime to pass input gocv.Mat's to Inference() function as float32
|
||||
// to RKNN backend
|
||||
rt.SetInputTypeFloat32(true)
|
||||
|
||||
// optional querying of model file tensors and SDK version. not necessary
|
||||
// for production inference code
|
||||
inputAttrs, outputAttrs := optionalQueries(rt)
|
||||
|
||||
// load in Model character labels
|
||||
modelChars, err := rknnlite.LoadLabels(*keysFile)
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error loading model OCR character keys: ", err)
|
||||
}
|
||||
|
||||
// check that we have as many modelChars as tensor outputs dimension
|
||||
if len(modelChars) != int(outputAttrs[0].Dims[2]) {
|
||||
log.Fatalf("OCR character keys text input has %d characters and does "+
|
||||
"not match the required number in the Model of %d",
|
||||
len(modelChars), outputAttrs[0].Dims[2])
|
||||
}
|
||||
|
||||
// create PPOCR post processor
|
||||
ppocrProcessor := postprocess.NewPPOCR(postprocess.PPOCRParams{
|
||||
ModelChars: modelChars,
|
||||
OutputSeqLen: int(inputAttrs[0].Dims[2]) / 8, // modelWidth (320/8)
|
||||
})
|
||||
|
||||
// load image
|
||||
img := gocv.IMRead(*imgFile, gocv.IMReadColor)
|
||||
|
||||
if img.Empty() {
|
||||
log.Fatal("Error reading image from: ", *imgFile)
|
||||
}
|
||||
|
||||
// resize image to 320x48 and keep aspect ratio, centered with black letterboxing
|
||||
resizedImg := gocv.NewMat()
|
||||
resizeKeepAspectRatio(img, &resizedImg, int(inputAttrs[0].Dims[2]), int(inputAttrs[0].Dims[1]))
|
||||
|
||||
// convert image to float32 in 3 channels
|
||||
resizedImg.ConvertTo(&resizedImg, gocv.MatTypeCV32FC3)
|
||||
|
||||
// normalize the image (img - 127.5) / 127.5
|
||||
resizedImg.AddFloat(-127.5)
|
||||
resizedImg.DivideFloat(127.5)
|
||||
|
||||
defer img.Close()
|
||||
defer resizedImg.Close()
|
||||
|
||||
start := time.Now()
|
||||
|
||||
// perform inference on image file
|
||||
outputs, err := rt.Inference([]gocv.Mat{resizedImg})
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Runtime inferencing failed with error: ", err)
|
||||
}
|
||||
|
||||
endInference := time.Now()
|
||||
|
||||
results := ppocrProcessor.Recognise(outputs)
|
||||
|
||||
endRecognise := time.Now()
|
||||
|
||||
log.Printf("Model first run speed: inference=%s, post processing=%s, total time=%s\n",
|
||||
endInference.Sub(start).String(),
|
||||
endRecognise.Sub(endInference).String(),
|
||||
endRecognise.Sub(start).String(),
|
||||
)
|
||||
|
||||
for _, result := range results {
|
||||
log.Printf("Recognize result: %s, score=%.2f", result.Text, result.Score)
|
||||
}
|
||||
|
||||
// free outputs allocated in C memory after you have finished post processing
|
||||
err = outputs.Free()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error freeing Outputs: ", err)
|
||||
}
|
||||
|
||||
// optional code. run benchmark to get average time of 10 runs
|
||||
runBenchmark(rt, ppocrProcessor, []gocv.Mat{resizedImg})
|
||||
|
||||
// close runtime and release resources
|
||||
err = rt.Close()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error closing RKNN runtime: ", err)
|
||||
}
|
||||
|
||||
log.Println("done")
|
||||
}
|
||||
|
||||
func runBenchmark(rt *rknnlite.Runtime, ppocrProcessor *postprocess.PPOCR,
|
||||
mats []gocv.Mat) {
|
||||
|
||||
count := 100
|
||||
start := time.Now()
|
||||
|
||||
for i := 0; i < count; i++ {
|
||||
// perform inference on image file
|
||||
outputs, err := rt.Inference(mats)
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Runtime inferencing failed with error: ", err)
|
||||
}
|
||||
|
||||
// post process
|
||||
_ = ppocrProcessor.Recognise(outputs)
|
||||
|
||||
err = outputs.Free()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error freeing Outputs: ", err)
|
||||
}
|
||||
}
|
||||
|
||||
end := time.Now()
|
||||
total := end.Sub(start)
|
||||
avg := total / time.Duration(count)
|
||||
|
||||
log.Printf("Benchmark time=%s, count=%d, average total time=%s\n",
|
||||
total.String(), count, avg.String(),
|
||||
)
|
||||
}
|
||||
|
||||
// resizeKeepAspectRatio resizes an image to a desired width and height while
|
||||
// maintaining the aspect ratio. The resulting image is centered with black
|
||||
// letterboxing where necessary.
|
||||
func resizeKeepAspectRatio(srcImg gocv.Mat, dstImg *gocv.Mat, width, height int) {
|
||||
|
||||
// calculate the ratio of the original image
|
||||
srcWidth := srcImg.Cols()
|
||||
srcHeight := srcImg.Rows()
|
||||
srcRatio := float64(srcWidth) / float64(srcHeight)
|
||||
|
||||
// calculate the ratio of the new dimensions
|
||||
dstRatio := float64(width) / float64(height)
|
||||
|
||||
newWidth, newHeight := width, height
|
||||
|
||||
// adjust dimensions to maintain aspect ratio
|
||||
if srcRatio > dstRatio {
|
||||
newHeight = int(float64(width) / srcRatio)
|
||||
} else {
|
||||
newWidth = int(float64(height) * srcRatio)
|
||||
}
|
||||
|
||||
// resize the original image to the new size that fits within the desired dimensions
|
||||
resizedImg := gocv.NewMat()
|
||||
gocv.Resize(srcImg, &resizedImg, image.Pt(newWidth, newHeight), 0, 0, gocv.InterpolationLinear)
|
||||
defer resizedImg.Close()
|
||||
|
||||
// ensure destination Mat is the correct size and type
|
||||
if dstImg.Empty() {
|
||||
*dstImg = gocv.NewMatWithSize(height, width, gocv.MatTypeCV8UC3)
|
||||
}
|
||||
|
||||
// create a black image
|
||||
dstImg.SetTo(gocv.NewScalar(0, 0, 0, 0))
|
||||
|
||||
// find the top-left corner coordinates to center the resized image
|
||||
//x := (width - newWidth) / 2
|
||||
y := (height - newHeight) / 2
|
||||
x := 0
|
||||
|
||||
// define a region of interest (ROI) within the final image where the
|
||||
// resized image will be placed
|
||||
roi := dstImg.Region(image.Rect(x, y, x+newWidth, y+newHeight))
|
||||
resizedImg.CopyTo(&roi)
|
||||
roi.Close()
|
||||
}
|
||||
|
||||
func optionalQueries(rt *rknnlite.Runtime) ([]rknnlite.TensorAttr, []rknnlite.TensorAttr) {
|
||||
|
||||
// get SDK version
|
||||
ver, err := rt.SDKVersion()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error initializing RKNN runtime: ", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Driver Version: %s, API Version: %s\n", ver.DriverVersion, ver.APIVersion)
|
||||
|
||||
// get model input and output numbers
|
||||
num, err := rt.QueryModelIONumber()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error querying IO Numbers: ", err)
|
||||
}
|
||||
|
||||
log.Printf("Model Input Number: %d, Ouput Number: %d\n", num.NumberInput, num.NumberOutput)
|
||||
|
||||
// query Input tensors
|
||||
inputAttrs, err := rt.QueryInputTensors()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error querying Input Tensors: ", err)
|
||||
}
|
||||
|
||||
log.Println("Input tensors:")
|
||||
|
||||
for _, attr := range inputAttrs {
|
||||
log.Printf(" %s\n", attr.String())
|
||||
}
|
||||
|
||||
// query Output tensors
|
||||
outputAttrs, err := rt.QueryOutputTensors()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("Error querying Output Tensors: ", err)
|
||||
}
|
||||
|
||||
log.Println("Output tensors:")
|
||||
|
||||
for _, attr := range outputAttrs {
|
||||
log.Printf(" %s\n", attr.String())
|
||||
}
|
||||
|
||||
return inputAttrs, outputAttrs
|
||||
}
|
BIN
example/ppocr-rec/region.jpg
Normal file
BIN
example/ppocr-rec/region.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 16 KiB |
47
inference.go
47
inference.go
@@ -3,6 +3,7 @@ package rknnlite
|
||||
/*
|
||||
#include "rknn_api.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
@@ -47,22 +48,40 @@ func (r *Runtime) Inference(mats []gocv.Mat) (*Outputs, error) {
|
||||
mat = mat.Clone()
|
||||
}
|
||||
|
||||
// cast to float32, as PassThrough below is set to false then RKNN
|
||||
// will convert the input values to that of the tensor inputs in the model,
|
||||
// eg: INT8
|
||||
data, err := mat.DataPtrUint8()
|
||||
if r.inputTypeFloat32 {
|
||||
// pass data as float32 to RKNN backend
|
||||
data, err := mat.DataPtrFloat32()
|
||||
|
||||
if err != nil {
|
||||
return &Outputs{}, fmt.Errorf("error converting image to float32: %w", err)
|
||||
}
|
||||
if err != nil {
|
||||
return &Outputs{}, fmt.Errorf("error getting data pointer to Mat: %w", err)
|
||||
}
|
||||
|
||||
inputs[idx] = Input{
|
||||
Index: uint32(idx),
|
||||
Type: TensorUint8,
|
||||
Size: uint32(mat.Cols() * mat.Rows() * mat.Channels()),
|
||||
Fmt: TensorNHWC,
|
||||
Buf: unsafe.Pointer(&data[0]),
|
||||
PassThrough: false,
|
||||
inputs[idx] = Input{
|
||||
Index: uint32(idx),
|
||||
Type: TensorFloat32,
|
||||
// multiply by 4 for size of float32
|
||||
Size: uint32(mat.Cols() * mat.Rows() * mat.Channels() * 4),
|
||||
Fmt: TensorNHWC,
|
||||
Buf: unsafe.Pointer(&data[0]),
|
||||
PassThrough: false,
|
||||
}
|
||||
|
||||
} else {
|
||||
// pass data as uint8 to RKNN backend
|
||||
data, err := mat.DataPtrUint8()
|
||||
|
||||
if err != nil {
|
||||
return &Outputs{}, fmt.Errorf("error getting data pointer to Mat: %w", err)
|
||||
}
|
||||
|
||||
inputs[idx] = Input{
|
||||
Index: uint32(idx),
|
||||
Type: TensorUint8,
|
||||
Size: uint32(mat.Cols() * mat.Rows() * mat.Channels()),
|
||||
Fmt: TensorNHWC,
|
||||
Buf: unsafe.Pointer(&data[0]),
|
||||
PassThrough: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -28,6 +28,12 @@ func LoadLabels(file string) ([]string, error) {
|
||||
// read and trim each line
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
|
||||
// handle special keyword to convert to " " this is needed for
|
||||
// PPOCR key list
|
||||
if line == "__space__" {
|
||||
line = " "
|
||||
}
|
||||
labels = append(labels, line)
|
||||
}
|
||||
|
||||
|
120
postprocess/ppocr.go
Normal file
120
postprocess/ppocr.go
Normal file
@@ -0,0 +1,120 @@
|
||||
package postprocess
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/swdee/go-rknnlite"
|
||||
"math"
|
||||
)
|
||||
|
||||
// PPOCR defines the struct for the PPOCR model inference post processing
|
||||
type PPOCR struct {
|
||||
Params PPOCRParams
|
||||
}
|
||||
|
||||
// PPOCRParams holds the parameters the PPOCR post processor works with.
type PPOCRParams struct {
	// ModelChars is the list of characters the PPOCR model was trained
	// with
	ModelChars []string
	// numChar is the number of entries in ModelChars; NewPPOCR fills it in
	numChar int
	// OutputSeqLen is the length of the sequence output by the OCR model
	OutputSeqLen int
}
|
||||
|
||||
// RecogniseResult holds one piece of text recognised by OCR together with
// its confidence.
type RecogniseResult struct {
	// Text is the text that was recognised
	Text string
	// Score is the confidence score for the recognised text
	Score float32
}
|
||||
|
||||
// NewPPOCR returns an instance of the PPOCR post processor
|
||||
func NewPPOCR(param PPOCRParams) *PPOCR {
|
||||
p := &PPOCR{
|
||||
Params: param,
|
||||
}
|
||||
|
||||
p.Params.numChar = len(param.ModelChars)
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
// Recognise takes the RKNN outputs and converts them to text
|
||||
func (p *PPOCR) Recognise(outputs *rknnlite.Outputs) []RecogniseResult {
|
||||
|
||||
results := make([]RecogniseResult, len(outputs.Output))
|
||||
|
||||
for idx, output := range outputs.Output {
|
||||
rec, err := p.recogniseText(output)
|
||||
|
||||
if err != nil {
|
||||
results[idx] = RecogniseResult{
|
||||
Text: "ERROR ModelChars",
|
||||
Score: 0,
|
||||
}
|
||||
} else {
|
||||
results[idx] = rec
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// recogniseText takes a single RKNN Output and returns the OCR'd text as string
|
||||
func (p *PPOCR) recogniseText(output rknnlite.Output) (RecogniseResult, error) {
|
||||
|
||||
res := RecogniseResult{}
|
||||
|
||||
var argmaxVal float32
|
||||
var argmaxIdx, lastIdx, count int
|
||||
|
||||
for n := 0; n < p.Params.OutputSeqLen; n++ {
|
||||
|
||||
offset := n * p.Params.numChar
|
||||
argmaxIdx, argmaxVal = p.argMax(output.BufFloat[offset : offset+p.Params.numChar])
|
||||
|
||||
if argmaxIdx > 0 && !(n > 0 && argmaxIdx == lastIdx) {
|
||||
// add to score max value
|
||||
res.Score += argmaxVal
|
||||
count++
|
||||
|
||||
if argmaxIdx > p.Params.numChar {
|
||||
return RecogniseResult{}, fmt.Errorf("output index is larger than size of ModelChars list")
|
||||
}
|
||||
|
||||
res.Text += p.Params.ModelChars[argmaxIdx]
|
||||
}
|
||||
|
||||
lastIdx = argmaxIdx
|
||||
}
|
||||
|
||||
res.Score /= float32(count) + 1e-6
|
||||
|
||||
if count == 0 || math.IsNaN(float64(res.Score)) {
|
||||
res.Score = 0.0
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// argMax returns the index of the maximum element in a slice
|
||||
func (p *PPOCR) argMax(slice []float32) (int, float32) {
|
||||
|
||||
if len(slice) == 0 {
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
maxIdx := 0
|
||||
maxValue := slice[0]
|
||||
|
||||
for i, value := range slice {
|
||||
if value > maxValue {
|
||||
maxValue = value
|
||||
maxIdx = i
|
||||
}
|
||||
}
|
||||
|
||||
return maxIdx, maxValue
|
||||
}
|
10
runtime.go
10
runtime.go
@@ -99,6 +99,9 @@ type Runtime struct {
|
||||
// wantFloat indicates if Outputs are converted to float32 or left as int8.
|
||||
// default option is True
|
||||
wantFloat bool
|
||||
// inputTypeFloat32 indicates if we pass the input gocv.Mat's data as float32
|
||||
// to the RKNN backend
|
||||
inputTypeFloat32 bool
|
||||
}
|
||||
|
||||
// NewRuntime returns a RKNN run time instance. Provide the full path and
|
||||
@@ -211,6 +214,13 @@ func (r *Runtime) SetWantFloat(val bool) {
|
||||
r.wantFloat = val
|
||||
}
|
||||
|
||||
// SetInputTypeFloat32 defines if the Model requires the Inference() function to
|
||||
// pass the gocv.Mat's as float32 data to RKNN backend. Setting this overrides
|
||||
// the default behaviour to pass gocv.Mat data as Uint8
|
||||
func (r *Runtime) SetInputTypeFloat32(val bool) {
|
||||
r.inputTypeFloat32 = val
|
||||
}
|
||||
|
||||
// SDKVersion represents the C.rknn_sdk_version struct
|
||||
type SDKVersion struct {
|
||||
DriverVersion string
|
||||
|
Reference in New Issue
Block a user