From 7a0b4e5d979a50042221e843179ce1c00a01312b Mon Sep 17 00:00:00 2001 From: swdee Date: Sun, 22 Sep 2024 14:34:22 +1200 Subject: [PATCH] changed float16 to float32 from lookup table to CGO call --- float16.go | 33 +++++++++++++++++++++++++-------- inference.go | 6 ++---- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/float16.go b/float16.go index a7c5cac..46cf697 100644 --- a/float16.go +++ b/float16.go @@ -1,13 +1,30 @@ package rknnlite -import "github.com/x448/float16" +/* +#cgo CFLAGS: -march=native -mtune=native -Ofast -flto +#cgo LDFLAGS: -march=native -mtune=native -Ofast -var f16LookupTable [65536]float32 +#include <stdint.h> -func init() { - // precompute float16 lookup table for faster conversion to float32 - for i := range f16LookupTable { - f16 := float16.Frombits(uint16(i)) - f16LookupTable[i] = f16.Float32() - } +void float16_to_float32_buffer(const uint16_t* input, float* output, size_t count) { + for (size_t i = 0; i < count; i++) { + _Float16 tmp = *(_Float16*)&input[i]; + output[i] = (float)tmp; + } +} + +*/ +import "C" +import ( + "unsafe" +) + +// float16ToFloat32Buffer converts each float16 value in float16Buf to float32, +// writing the results into float32Buf via a C routine +func float16ToFloat32Buffer(float16Buf []uint16, float32Buf []float32) { + C.float16_to_float32_buffer( + (*C.uint16_t)(unsafe.Pointer(&float16Buf[0])), // Pointer to the input buffer + (*C.float)(unsafe.Pointer(&float32Buf[0])), // Pointer to the output buffer + C.size_t(len(float16Buf)), // Number of elements to convert + ) } diff --git a/inference.go b/inference.go index b3f961c..e03c360 100644 --- a/inference.go +++ b/inference.go @@ -236,11 +236,9 @@ func (r *Runtime) GetOutputs(nOutputs uint32, wantFloat bool) (*Outputs, error) // convertFloat16BufferToFloat32 converts a float16 buffer to float32 as Go // has not support for FP16.
func convertFloat16BufferToFloat32(float16Buf []uint16) []float32 { - float32Buf := make([]float32, len(float16Buf)) - for i, val := range float16Buf { - float32Buf[i] = f16LookupTable[val] - } + float32Buf := make([]float32, len(float16Buf)) + // float16ToFloat32Buffer takes the address of element 0 of both buffers, + // which panics on an empty slice. The removed lookup loop tolerated empty + // input, so keep returning an empty result without calling into C. + if len(float16Buf) == 0 { + return float32Buf + } + float16ToFloat32Buffer(float16Buf, float32Buf) return float32Buf }