translate netlib l2norm algorithm to asm and remove branches

The Netlib algorithm reduces the risk of overflow when calculating the L2 norm of a vector. This commit translates it to amd64 assembly and reduces the branching in the NaN and Inf checks. Overflow protection is equivalent to that of the standard Netlib implementation.
2025-09-27 03:26:04 +08:00 · 2019-11-03 21:11:46 +08:00
parent 90165046ad
commit 4e1ef9c972
6 changed files with 236 additions and 37 deletions
--- a/floats/floats_test.go
+++ b/floats/floats_test.go
@@ -36,7 +36,6 @@ func areSlicesSame(t *testing.T, truth, comp []float64, str string) {
 				break
 			}
 		}
-
 	}
 	if !ok {
 		t.Errorf(str+". Expected %v, returned %v", truth, comp)
@@ -96,7 +95,6 @@ func TestAddTo(t *testing.T) {
 	if !Panics(func() { AddTo(make([]float64, 3), make([]float64, 3), make([]float64, 2)) }) {
 		t.Errorf("Did not panic with length mismatch")
 	}
-
 }

 func TestAddConst(t *testing.T) {
@@ -208,7 +206,6 @@ func TestCumProd(t *testing.T) {
 	truth = []float64{}
 	CumProd(emptyReceiver, emptyReceiver)
 	areSlicesEqual(t, truth, emptyReceiver, "Wrong cumprod returned with empty receiver")
-
 }

 func TestCumSum(t *testing.T) {
@@ -231,7 +228,6 @@ func TestCumSum(t *testing.T) {
 	truth = []float64{}
 	CumSum(emptyReceiver, emptyReceiver)
 	areSlicesEqual(t, truth, emptyReceiver, "Wrong cumsum returned with empty receiver")
-
 }

 func TestDistance(t *testing.T) {
@@ -270,7 +266,6 @@ func TestDistance(t *testing.T) {
 	if !Panics(func() { Distance([]float64{}, norms, 1) }) {
 		t.Errorf("Did not panic with unequal lengths")
 	}
-
 }

 func TestDiv(t *testing.T) {
@@ -398,7 +393,7 @@ func TestEqualFunc(t *testing.T) {
 }

 func TestEqualsRelative(t *testing.T) {
-	var equalityTests = []struct {
+	equalityTests := []struct {
 		a, b  float64
 		tol   float64
 		equal bool
@@ -518,7 +513,6 @@ func TestEqualsULP(t *testing.T) {
 	if EqualWithinULP(1, math.NaN(), 10) {
 		t.Errorf("NaN returned as equal")
 	}
-
 }

 func TestEqualLengths(t *testing.T) {
@@ -699,7 +693,6 @@ func TestLogSumExp(t *testing.T) {
 	if math.Abs(val-truth) > EqTolerance {
 		t.Errorf("Wrong logsumexp for values with negative infinity")
 	}
-
 }

 func TestMaxAndIdx(t *testing.T) {
@@ -1572,7 +1565,6 @@ func TestWithin(t *testing.T) {
 			t.Errorf("Case %v: Idx mismatch. Want: %v, got: %v", i, test.idx, idx)
 		}
 	}
-
 }

 func randomSlice(l int) []float64 {
--- a/internal/asm/f64/l2norm.go
+++ b/internal/asm/f64/l2norm.go
@@ -6,33 +6,6 @@ package f64

 import "math"

-// L2NormUnitary is the level 2 norm of x.
-func L2NormUnitary(x []float64) (sum float64) {
-	var scale float64
-	sumSquares := 1.0
-	for _, v := range x {
-		if v == 0 {
-			continue
-		}
-		absxi := math.Abs(v)
-		if math.IsNaN(absxi) {
-			return math.NaN()
-		}
-		if scale < absxi {
-			s := scale / absxi
-			sumSquares = 1 + sumSquares*s*s
-			scale = absxi
-		} else {
-			s := absxi / scale
-			sumSquares += s * s
-		}
-	}
-	if math.IsInf(scale, 1) {
-		return math.Inf(1)
-	}
-	return scale * math.Sqrt(sumSquares)
-}
-
 // L2NormInc is the level 2 norm of x.
 func L2NormInc(x []float64, n, incX uintptr) (sum float64) {
 	var scale float64
--- a/internal/asm/f64/l2norm_amd64.s
+++ b/internal/asm/f64/l2norm_amd64.s
@@ -0,0 +1,109 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!appengine,!safe
+
+#include "textflag.h"
+
+#define SUMSQ X0
+#define ABSX X1
+#define SCALE X2
+#define ZERO X3
+#define TMP X4
+#define ABSMASK X5
+#define INF X7
+#define INFMASK X11
+#define NANMASK X12
+#define IDX AX
+#define LEN SI
+#define X_ DI
+
+#define ABSMASK_DATA l2nrodata<>+0(SB)
+#define INF_DATA l2nrodata<>+8(SB)
+#define NAN_DATA l2nrodata<>+16(SB)
+// AbsMask
+DATA l2nrodata<>+0(SB)/8, $0x7FFFFFFFFFFFFFFF
+// Inf
+DATA l2nrodata<>+8(SB)/8, $0x7FF0000000000000
+// NaN
+DATA l2nrodata<>+16(SB)/8, $0xFFF8000000000000
+GLOBL l2nrodata<>+0(SB), RODATA, $24
+
+// L2NormUnitary returns the L2-norm of x.
+// func L2NormUnitary(x []float64) (sum float64)
+TEXT ·L2NormUnitary(SB), NOSPLIT, $0
+	MOVQ x_len+8(FP), LEN
+	MOVQ x_base+0(FP), X_
+	PXOR ZERO, ZERO
+	CMPQ LEN, $0          // if len(x) == 0 { return 0 }
+	JZ   retZero
+
+	PXOR  INFMASK, INFMASK
+	PXOR  NANMASK, NANMASK
+	MOVSD $1.0, SUMSQ                // ssq = 1
+	XORPS SCALE, SCALE
+	MOVSD l2nrodata<>+0(SB), ABSMASK
+	MOVSD l2nrodata<>+8(SB), INF
+	XORQ  IDX, IDX                   // idx = 0
+
+initZero:  // for ; x[i] == 0; i++ {}
+	// Skip all leading zeros, so that the main loop never computes 0/0 (NaN) while scale is still 0
+	MOVSD   (X_)(IDX*8), ABSX // absxi = x[i]
+	UCOMISD ABSX, ZERO
+	JP      retNaN            // if isNaN(x[i]) { return NaN }
+	JNE     loop              // if x[i] != 0 { goto loop }
+	INCQ    IDX
+	CMPQ    IDX, LEN
+	JE      retZero           // if ++i == len(x) { return 0 }
+	JMP     initZero
+
+loop:
+	MOVSD   (X_)(IDX*8), ABSX // absxi = x[i]
+	MOVUPS  ABSX, TMP
+	CMPSD   ABSX, TMP, $3     // tmp = IsNaN(absxi) as an all-ones mask (unordered compare)
+	ORPD    TMP, NANMASK      // NANMASK = NANMASK | IsNaN(absxi)
+	MOVSD   INF, TMP
+	ANDPD   ABSMASK, ABSX     // absxi = Abs(absxi)
+	CMPSD   ABSX, TMP, $0     // tmp = (absxi == Inf) as an all-ones mask
+	ORPD    TMP, INFMASK      // INFMASK = INFMASK | IsInf(absxi)
+	UCOMISD SCALE, ABSX
+	JA      adjScale          // if absxi > scale { goto adjScale }
+
+	DIVSD SCALE, ABSX // absxi = absxi / scale
+	MULSD ABSX, ABSX  // absxi *= absxi
+	ADDSD ABSX, SUMSQ // sumsq += absxi
+	INCQ  IDX         // ++i
+	CMPQ  IDX, LEN
+	JNE   loop        // if i < len(x) { continue }
+	JMP   retSum      // if i == len(x) { goto retSum }
+
+adjScale:  // absxi > scale
+	DIVSD  ABSX, SCALE  // scale = scale / absxi
+	MULSD  SCALE, SUMSQ // sumsq *= scale
+	MULSD  SCALE, SUMSQ // sumsq *= scale
+	ADDSD  $1.0, SUMSQ  // sumsq += 1
+	MOVUPS ABSX, SCALE  // scale = absxi
+	INCQ   IDX          // ++i
+	CMPQ   IDX, LEN
+	JNE    loop         // if i < len(x) { continue }
+
+retSum:  // Calculate return value
+	SQRTSD  SUMSQ, SUMSQ    // sumsq = sqrt(sumsq)
+	MULSD   SCALE, SUMSQ    // sumsq *= scale
+	MOVQ    SUMSQ, R10      // tmp = sumsq
+	UCOMISD ZERO, INFMASK   // unordered (parity set) when INFMASK is all ones
+	CMOVQPS INF_DATA, R10   // if INFMASK { tmp = INF }
+	UCOMISD ZERO, NANMASK   // unordered (parity set) when NANMASK is all ones
+	CMOVQPS NAN_DATA, R10   // if NANMASK { tmp = NaN }
+	MOVQ    R10, sum+24(FP) // return tmp
+	RET
+
+retZero:
+	MOVSD ZERO, sum+24(FP) // return 0
+	RET
+
+retNaN:
+	MOVSD NAN_DATA, TMP   // return NaN
+	MOVSD TMP, sum+24(FP)
+	RET
--- a/internal/asm/f64/l2norm_noasm.go
+++ b/internal/asm/f64/l2norm_noasm.go
@@ -0,0 +1,36 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 noasm appengine safe
+
+package f64
+
+import "math"
+
+// L2NormUnitary returns the L2-norm of x, scaling by the largest element seen to limit overflow.
+func L2NormUnitary(x []float64) (sum float64) {
+	var scale float64 // largest absolute value seen so far
+	sumSquares := 1.0 // running sum of (|v|/scale)^2 once a nonzero value has been seen
+	for _, v := range x {
+		if v == 0 {
+			continue // zeros add nothing, and 0/0 below would give NaN while scale is still 0
+		}
+		absxi := math.Abs(v)
+		if math.IsNaN(absxi) {
+			return math.NaN()
+		}
+		if scale < absxi {
+			s := scale / absxi // factor that rescales the running sum to the new maximum
+			sumSquares = 1 + sumSquares*s*s
+			scale = absxi
+		} else {
+			s := absxi / scale
+			sumSquares += s * s
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1) // sumSquares may be NaN here (Inf/Inf above), so return Inf directly
+	}
+	return scale * math.Sqrt(sumSquares)
+}
--- a/internal/asm/f64/l2norm_test.go
+++ b/internal/asm/f64/l2norm_test.go
@@ -4,7 +4,20 @@

 package f64

-import "testing"
+import (
+	"fmt"
+	"math"
+	"testing"
+)
+
+// nanwith returns a NaN whose low 51 bits are taken from payload (copied from the floats package).
+func nanwith(payload uint64) float64 {
+	const (
+		nanBits = 0x7ff8000000000000
+		nanMask = 0xfff8000000000000 // sign, exponent and top mantissa bit: cleared from payload
+	)
+	return math.Float64frombits(nanBits | (payload &^ nanMask))
+}

 func TestL2NormUnitary(t *testing.T) {
 	var src_gd float64 = 1
@@ -17,6 +30,7 @@ func TestL2NormUnitary(t *testing.T) {
 		{want: 3.7416573867739413, x: []float64{1, 2, 3}},
 		{want: 3.7416573867739413, x: []float64{-1, -2, -3}},
 		{want: nan, x: []float64{nan}},
+		{want: nan, x: []float64{1, inf, 3, nanwith(25), 5}},
 		{want: 17.88854381999832, x: []float64{8, -8, 8, -8, 8}},
 		{want: 2.23606797749979, x: []float64{0, 1, 0, -1, 0, 1, 0, -1, 0, 1}},
 	} {
@@ -87,3 +101,52 @@ func TestL2DistanceUnitary(t *testing.T) {
 		}
 	}
 }
+
+func BenchmarkL2NormNetlib(b *testing.B) {
+	netlib := func(x []float64) (sum float64) { // pure-Go version of the algorithm, benchmarked as a baseline
+		var scale float64
+		sumSquares := 1.0
+		for _, v := range x {
+			if v == 0 {
+				continue
+			}
+			absxi := math.Abs(v)
+			if math.IsNaN(absxi) {
+				return math.NaN()
+			}
+			if scale < absxi {
+				s := scale / absxi
+				sumSquares = 1 + sumSquares*s*s
+				scale = absxi
+			} else {
+				s := absxi / scale
+				sumSquares += s * s
+			}
+		}
+		if math.IsInf(scale, 1) {
+			return math.Inf(1)
+		}
+		return scale * math.Sqrt(sumSquares)
+	}
+
+	tests := []struct {
+		name string
+		f    func(x []float64) float64
+	}{
+		{"L2NormUnitaryNetlib", netlib},
+		{"L2NormUnitary", L2NormUnitary},
+	}
+	x[0] = randomSlice(1, 1)[0] // replace the leading zero (edge case)
+	for _, test := range tests {
+		for _, ln := range []uintptr{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5} {
+			b.Run(fmt.Sprintf("%s-%d", test.name, ln), func(b *testing.B) {
+				b.SetBytes(int64(64 * ln)) // NOTE(review): a float64 is 8 bytes, so 64*ln looks like bits rather than bytes — confirm
+				x := x[:ln]
+				b.ResetTimer()
+				for i := 0; i < b.N; i++ {
+					test.f(x)
+				}
+			})
+		}
+	}
+}
--- a/internal/asm/f64/stubs_amd64.go
+++ b/internal/asm/f64/stubs_amd64.go
@@ -170,3 +170,29 @@ func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, inc
 //      sum += x[i]
 //  }
 func Sum(x []float64) float64
+
+// L2NormUnitary returns the L2-norm of x.
+//   var scale float64
+//   sumSquares := 1.0
+//   for _, v := range x {
+//   	if v == 0 {
+//   		continue
+//   	}
+//   	absxi := math.Abs(v)
+//   	if math.IsNaN(absxi) {
+//   		return math.NaN()
+//   	}
+//   	if scale < absxi {
+//   		s := scale / absxi
+//   		sumSquares = 1 + sumSquares*s*s
+//   		scale = absxi
+//   	} else {
+//   		s := absxi / scale
+//   		sumSquares += s * s
+//   	}
+//   }
+//   if math.IsInf(scale, 1) {
+//   	return math.Inf(1)
+//   }
+//   return scale * math.Sqrt(sumSquares)
+func L2NormUnitary(x []float64) (sum float64)