package main

import (
	"flag"
	"fmt"
	"github.com/swdee/go-rknnlite"
	"github.com/swdee/go-rknnlite/postprocess"
	"github.com/swdee/go-rknnlite/preprocess"
	"github.com/swdee/go-rknnlite/render"
	"gocv.io/x/gocv"
	"image"
	"log"
	"math"
	"os"
	"sort"
	"strings"
	"time"
)

func main() {

	// disable logging timestamps
	log.SetFlags(0)

	// read in cli flags
	detectModelFile := flag.String("d", "../../data/models/rk3588/ppocrv4_det-rk3588.rknn",
		"RKNN compiled model file for OCR Detection")
	recogniseModelFile := flag.String("r", "../../data/models/rk3588/ppocrv4_rec-rk3588.rknn",
		"RKNN compiled model file for OCR Recognition")
	keysFile := flag.String("k", "../../data/ppocr_keys_v1.txt",
		"Text file containing OCR character keys")
	imgFile := flag.String("i", "../../data/ppocr-det-test.png",
		"Image file to run inference on")
	rkPlatform := flag.String("p", "rk3588",
		"Rockchip CPU Model number [rk3562|rk3566|rk3568|rk3576|rk3582|rk3588]")

	flag.Parse()

	err := rknnlite.SetCPUAffinityByPlatform(*rkPlatform, rknnlite.FastCores)

	if err != nil {
		log.Printf("Failed to set CPU Affinity: %v\n", err)
	}

	// check if the user specified a model file or if the default is being used.
	// if the default is in use, swap in the model compiled for the selected platform.
	if f := flag.Lookup("d"); f != nil && f.Value.String() == f.DefValue && *rkPlatform != "rk3588" {
		*detectModelFile = strings.ReplaceAll(*detectModelFile, "rk3588", *rkPlatform)
	}

	if f := flag.Lookup("r"); f != nil && f.Value.String() == f.DefValue && *rkPlatform != "rk3588" {
		*recogniseModelFile = strings.ReplaceAll(*recogniseModelFile, "rk3588", *rkPlatform)
	}

	// create rknn runtime instances
	detectRt, err := rknnlite.NewRuntimeByPlatform(*rkPlatform, *detectModelFile)

	if err != nil {
		log.Fatal("Error initializing Detect RKNN runtime: ", err)
	}

	recogniseRt, err := rknnlite.NewRuntimeByPlatform(*rkPlatform, *recogniseModelFile)

	if err != nil {
		log.Fatal("Error initializing Recognise RKNN runtime: ", err)
	}

	// set runtime to pass input gocv.Mat's to the Inference() function as
	// float32 to the RKNN backend
	recogniseRt.SetInputTypeFloat32(true)

	// optional querying of model file tensors and SDK version for printing
	// to stdout;
	// not necessary for production inference code
	err = recogniseRt.Query(os.Stdout)

	if err != nil {
		log.Fatal("Error querying runtime: ", err)
	}

	err = detectRt.Query(os.Stdout)

	if err != nil {
		log.Fatal("Error querying runtime: ", err)
	}

	// load in Model character labels
	modelChars, err := rknnlite.LoadLabels(*keysFile)

	if err != nil {
		log.Fatal("Error loading model OCR character keys: ", err)
	}

	// check that we have as many modelChars as the tensor output dimension
	if len(modelChars) != int(recogniseRt.OutputAttrs()[0].Dims[2]) {
		log.Fatalf("OCR character keys text input has %d characters and does "+
			"not match the required number in the Model of %d",
			len(modelChars), recogniseRt.OutputAttrs()[0].Dims[2])
	}

	// create PPOCR post processors
	recogniseProcessor := postprocess.NewPPOCRRecognise(postprocess.PPOCRRecogniseParams{
		ModelChars:   modelChars,
		OutputSeqLen: int(recogniseRt.InputAttrs()[0].Dims[2]) / 8, // modelWidth (320/8)
	})

	detectProcessor := postprocess.NewPPOCRDetect(postprocess.PPOCRDetectParams{
		Threshold:    0.3,
		BoxThreshold: 0.6,
		Dilation:     false,
		BoxType:      "poly",
		UnclipRatio:  1.5,
		ScoreMode:    "slow",
		ModelWidth:   int(detectRt.InputAttrs()[0].Dims[2]),
		ModelHeight:  int(detectRt.InputAttrs()[0].Dims[1]),
	})

	// load image
	img := gocv.IMRead(*imgFile, gocv.IMReadColor)

	if img.Empty() {
		log.Fatal("Error reading image from: ", *imgFile)
	}

	// resize image to 480x480 keeping the aspect ratio, centered with black letterboxing
	resizedImg := gocv.NewMat()
	resizer := preprocess.NewResizer(img.Cols(), img.Rows(),
		int(detectRt.InputAttrs()[0].Dims[2]), int(detectRt.InputAttrs()[0].Dims[1]),
	)

	resizer.LetterBoxResize(img, &resizedImg, render.Black)

	defer img.Close()
	defer resizedImg.Close()
	defer resizer.Close()

	start := time.Now()

	// perform inference on image file
	outputs, err := detectRt.Inference([]gocv.Mat{resizedImg})

	if err != nil {
		log.Fatal("Runtime inferencing failed with error: ", err)
	}

	// work out scale ratio between source image and resized image
	scaleW := float32(img.Cols()) / float32(resizedImg.Cols())
	scaleH := float32(img.Rows()) / float32(resizedImg.Rows())

	results := detectProcessor.Detect(outputs, scaleW, scaleH)

	// sort results in order from top to bottom and left to right
	SortBoxes(&results)

	endDetect := time.Now() // also marks the start of the recognise stage

	// create Mat for cropped region of text
	region := gocv.NewMat()
	defer region.Close()

	for _, result := range results {
		for i, box := range result.Box {

			fmt.Printf("[%d]: [(%d, %d), (%d, %d), (%d, %d), (%d, %d)] %f\n",
				i, box.LeftTop.X, box.LeftTop.Y, box.RightTop.X, box.RightTop.Y,
				box.RightBottom.X, box.RightBottom.Y, box.LeftBottom.X,
				box.LeftBottom.Y, box.Score)

			GetRotateCropImage(img, &region, box)

			// perform text recognition on the cropped region
			recogniseTextBlock(recogniseRt, recogniseProcessor, region,
				int(recogniseRt.InputAttrs()[0].Dims[2]),
				int(recogniseRt.InputAttrs()[0].Dims[1]))
		}
	}

	endRecognise := time.Now()

	log.Printf("Run speed:\n Detect processing=%s\n"+
		" Recognise processing=%s\n"+
		" Total time=%s\n",
		endDetect.Sub(start).String(),
		endRecognise.Sub(endDetect).String(),
		endRecognise.Sub(start).String(),
	)

	// free outputs allocated in C memory after you have finished post processing
	err = outputs.Free()

	if err != nil {
		log.Fatal("Error freeing Outputs: ", err)
	}

	// close runtimes and release resources
	err = detectRt.Close()

	if err != nil {
		log.Fatal("Error closing Detection RKNN runtime: ", err)
	}

	err = recogniseRt.Close()

	if err != nil {
		log.Fatal("Error closing Recognition RKNN runtime: ", err)
	}

	log.Println("done")
}

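// recogniseTextBlock letterbox resizes and normalises a cropped text region,
// runs it through the recognition model, and logs the recognised text with its
// confidence score. inWidth and inHeight are the recognition model's input
// dimensions.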
func recogniseTextBlock(recogniseRt *rknnlite.Runtime,
	recogniseProcessor *postprocess.PPOCRRecognise, img gocv.Mat,
	inWidth, inHeight int) {

	// resize image to 320x48 and keep aspect ratio, centered with black letterboxing
	resizedImg := gocv.NewMat()
	resizer := preprocess.NewResizer(img.Cols(), img.Rows(),
		inWidth, inHeight,
	)

	resizer.LetterBoxResize(img, &resizedImg, render.Black)

	// convert image to float32 in 3 channels
	resizedImg.ConvertTo(&resizedImg, gocv.MatTypeCV32FC3)

	// normalize the image (img - 127.5) / 127.5
	resizedImg.AddFloat(-127.5)
	resizedImg.DivideFloat(127.5)

	defer resizedImg.Close()
	defer resizer.Close()

	// perform inference on image file
	outputs, err := recogniseRt.Inference([]gocv.Mat{resizedImg})

	if err != nil {
		log.Fatal("Runtime inferencing failed with error: ", err)
	}

	results := recogniseProcessor.Recognise(outputs)

	for _, result := range results {
		log.Printf("Recognize result: %s, score=%.2f", result.Text, result.Score)
	}

	// free outputs allocated in C memory after you have finished post processing
	err = outputs.Free()

	if err != nil {
		log.Fatal("Error freeing Outputs: ", err)
	}
}

// CompareBox compares two boxes
func CompareBox(box1, box2 postprocess.PPOCRBox) bool {
	if box1.LeftTop.Y < box2.LeftTop.Y {
		return true
	} else if box1.LeftTop.Y == box2.LeftTop.Y {
		return box1.LeftTop.X < box2.LeftTop.X
	} else {
		return false
	}
}

// SortBoxes sorts the boxes in PPOCRDetectResult and adjusts the order
func SortBoxes(detectResults *[]postprocess.PPOCRDetectResult) {

	for _, result := range *detectResults {
		boxes := result.Box

		sort.Slice(boxes, func(i, j int) bool {
			return CompareBox(boxes[i], boxes[j])
		})

		if len(boxes) == 0 {
			continue
		}

		for i := 0; i < len(boxes)-1; i++ {
			for j := i; j >= 0; j-- {
				if math.Abs(float64(boxes[j+1].LeftTop.Y-boxes[j].LeftTop.Y)) < 10 &&
					(boxes[j+1].LeftTop.X < boxes[j].LeftTop.X) {
					boxes[j], boxes[j+1] = boxes[j+1], boxes[j]
				}
			}
		}
	}
}

// GetRotateCropImage takes the source image and crops it to the bounding box
// and rotates if needed.
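// The crop is warped to an axis-aligned rectangle with a perspective
// transform; if the result is much taller than it is wide (height >= 1.5x
// width) it is transposed and flipped so the text line runs horizontally
// for the recognition model.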
func GetRotateCropImage(srcImage gocv.Mat, dstImg *gocv.Mat, box postprocess.PPOCRBox) {

	// Crop the image
	rect := image.Rect(box.LeftTop.X, box.LeftTop.Y, box.RightBottom.X, box.RightBottom.Y)
	region := srcImage.Region(rect)
	defer region.Close()

	imgCrop := region.Clone()
	defer imgCrop.Close()

	// Convert the box points to a slice of image.Point
	points := []image.Point{
		{X: box.LeftTop.X, Y: box.LeftTop.Y},
		{X: box.RightTop.X, Y: box.RightTop.Y},
		{X: box.RightBottom.X, Y: box.RightBottom.Y},
		{X: box.LeftBottom.X, Y: box.LeftBottom.Y},
	}

	// Adjust the points to the coordinates of the cropped image
	left := minInt(
		box.LeftTop.X, box.RightTop.X, box.RightBottom.X, box.LeftBottom.X,
	)
	top := minInt(
		box.LeftTop.Y, box.RightTop.Y, box.RightBottom.Y, box.LeftBottom.Y,
	)

	// Adjust the points to the cropped region
	for i := range points {
		points[i].X -= left
		points[i].Y -= top
	}

	imgCropWidth := imgCrop.Cols()
	imgCropHeight := imgCrop.Rows()

	// Define the destination points for perspective transformation
	ptsStd := []image.Point{
		{X: 0, Y: 0},
		{X: imgCropWidth, Y: 0},
		{X: imgCropWidth, Y: imgCropHeight},
		{X: 0, Y: imgCropHeight},
	}

	// Get the perspective transform matrix
	srcPoints := gocv.NewPointVectorFromPoints(points)
	dstPoints := gocv.NewPointVectorFromPoints(ptsStd)

	M := gocv.GetPerspectiveTransform(srcPoints, dstPoints)
	defer M.Close()

	srcPoints.Close()
	dstPoints.Close()

	// Apply the warp perspective transformation
	gocv.WarpPerspective(imgCrop, dstImg, M, image.Pt(imgCropWidth, imgCropHeight))

	// Check if the image needs to be transposed and flipped
	if float32(dstImg.Rows()) >= float32(dstImg.Cols())*1.5 {
		srcCopy := gocv.NewMatWithSize(dstImg.Cols(), dstImg.Rows(), dstImg.Type())
		gocv.Transpose(*dstImg, &srcCopy)
		gocv.Flip(srcCopy, &srcCopy, 0)
		*dstImg = srcCopy.Clone()
		srcCopy.Close()
	}
}

// minInt finds the min value in a slice of integers
func minInt(nums ...int) int {
	min := nums[0]
	for _, v := range nums {
		if v < min {
			min = v
		}
	}
	return min
}