translate netlib l2norm algorithm to asm and remove branches

Netlib algorithm reduces overflow while calculating the l2norm of a vector. Translated to asm while reducing branches in NaN and Inf checks. Overflow protection is equivalent to the Netlib standard implementation.
2025-09-27 03:26:04 +08:00 · 2019-11-03 21:11:46 +08:00
parent 90165046ad
commit 4e1ef9c972
6 changed files with 236 additions and 37 deletions
--- a/floats/floats_test.go
+++ b/floats/floats_test.go
@@ -36,7 +36,6 @@ func areSlicesSame(t *testing.T, truth, comp []float64, str string) {
 				break
 			}
 		}
 	}
 	if !ok {
 		t.Errorf(str+". Expected %v, returned %v", truth, comp)
@@ -96,7 +95,6 @@ func TestAddTo(t *testing.T) {
 	if !Panics(func() { AddTo(make([]float64, 3), make([]float64, 3), make([]float64, 2)) }) {
 		t.Errorf("Did not panic with length mismatch")
 	}
 }
 func TestAddConst(t *testing.T) {
@@ -208,7 +206,6 @@ func TestCumProd(t *testing.T) {
 	truth = []float64{}
 	CumProd(emptyReceiver, emptyReceiver)
 	areSlicesEqual(t, truth, emptyReceiver, "Wrong cumprod returned with empty receiver")
 }
 func TestCumSum(t *testing.T) {
@@ -231,7 +228,6 @@ func TestCumSum(t *testing.T) {
 	truth = []float64{}
 	CumSum(emptyReceiver, emptyReceiver)
 	areSlicesEqual(t, truth, emptyReceiver, "Wrong cumsum returned with empty receiver")
 }
 func TestDistance(t *testing.T) {
@@ -270,7 +266,6 @@ func TestDistance(t *testing.T) {
 	if !Panics(func() { Distance([]float64{}, norms, 1) }) {
 		t.Errorf("Did not panic with unequal lengths")
 	}
 }
 func TestDiv(t *testing.T) {
@@ -398,7 +393,7 @@ func TestEqualFunc(t *testing.T) {
 }
 func TestEqualsRelative(t *testing.T) {
-	var equalityTests = []struct {
+	equalityTests := []struct {
 		a, b  float64
 		tol   float64
 		equal bool
@@ -518,7 +513,6 @@ func TestEqualsULP(t *testing.T) {
 	if EqualWithinULP(1, math.NaN(), 10) {
 		t.Errorf("NaN returned as equal")
 	}
 }
 func TestEqualLengths(t *testing.T) {
@@ -699,7 +693,6 @@ func TestLogSumExp(t *testing.T) {
 	if math.Abs(val-truth) > EqTolerance {
 		t.Errorf("Wrong logsumexp for values with negative infinity")
 	}
 }
 func TestMaxAndIdx(t *testing.T) {
@@ -1572,7 +1565,6 @@ func TestWithin(t *testing.T) {
 			t.Errorf("Case %v: Idx mismatch. Want: %v, got: %v", i, test.idx, idx)
 		}
 	}
 }
 func randomSlice(l int) []float64 {
--- a/internal/asm/f64/l2norm.go
+++ b/internal/asm/f64/l2norm.go
@@ -6,33 +6,6 @@ package f64
 import "math"
 // L2NormUnitary is the level 2 norm of x.
 func L2NormUnitary(x []float64) (sum float64) {
 	var scale float64
 	sumSquares := 1.0
 	for _, v := range x {
 		if v == 0 {
 			continue
 		}
 		absxi := math.Abs(v)
 		if math.IsNaN(absxi) {
 			return math.NaN()
 		}
 		if scale < absxi {
 			s := scale / absxi
 			sumSquares = 1 + sumSquares*s*s
 			scale = absxi
 		} else {
 			s := absxi / scale
 			sumSquares += s * s
 		}
 	}
 	if math.IsInf(scale, 1) {
 		return math.Inf(1)
 	}
 	return scale * math.Sqrt(sumSquares)
 }
 // L2NormInc is the level 2 norm of x.
 func L2NormInc(x []float64, n, incX uintptr) (sum float64) {
 	var scale float64
--- a/internal/asm/f64/l2norm_amd64.s
+++ b/internal/asm/f64/l2norm_amd64.s
@@ -0,0 +1,109 @@
 // Copyright ©2019 The Gonum Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !noasm,!appengine,!safe
 #include "textflag.h"
 #define SUMSQ X0
 #define ABSX X1
 #define SCALE X2
 #define ZERO X3
 #define TMP X4
 #define ABSMASK X5
 #define INF X7
 #define INFMASK X11
 #define NANMASK X12
 #define IDX AX
 #define LEN SI
 #define X_ DI
 #define ABSMASK_DATA l2nrodata<>+0(SB)
 #define INF_DATA l2nrodata<>+8(SB)
 #define NAN_DATA l2nrodata<>+16(SB)
 // AbsMask
 DATA l2nrodata<>+0(SB)/8, $0x7FFFFFFFFFFFFFFF
 // Inf
 DATA l2nrodata<>+8(SB)/8, $0x7FF0000000000000
 // NaN
 DATA l2nrodata<>+16(SB)/8, $0xFFF8000000000000
 GLOBL l2nrodata<>+0(SB), RODATA, $24
 // L2NormUnitary returns the L2-norm of x.
 // func L2NormUnitary(x []float64) (sum float64)
 TEXT ·L2NormUnitary(SB), NOSPLIT, $0
 	MOVQ x_len+8(FP), LEN
 	MOVQ x_base+0(FP), X_
 	PXOR ZERO, ZERO
 	CMPQ LEN, $0          // if len== 0 {return 0}
 	JZ   retZero
 	PXOR  INFMASK, INFMASK
 	PXOR  NANMASK, NANMASK
 	MOVSD $1.0, SUMSQ                // ssq = 1
 	XORPS SCALE, SCALE
 	MOVSD l2nrodata<>+0(SB), ABSMASK
 	MOVSD l2nrodata<>+8(SB), INF
 	XORQ  IDX, IDX                   // idx == 0
 initZero:  // for ;x[i]==0; i++ {}
 	// Skip all leading zeros, to avoid divide by zero NaN
 	MOVSD   (X_)(IDX*8), ABSX // absxi = x[i]
 	UCOMISD ABSX, ZERO
 	JP      retNaN            // if isNaN(x[i]) { return NaN }
 	JNE     loop              // if x[i] != 0 { goto loop }
 	INCQ    IDX
 	CMPQ    IDX, LEN
 	JE      retZero           // if ++i == len(x) { return 0 }
 	JMP     initZero
 loop:
 	MOVSD   (X_)(IDX*8), ABSX // absxi = x[i]
 	MOVUPS  ABSX, TMP
 	CMPSD   ABSX, TMP, $3
 	ORPD    TMP, NANMASK      // NANMASK = NANMASK | IsNaN(absxi)
 	MOVSD   INF, TMP
 	ANDPD   ABSMASK, ABSX     // absxi == Abs(absxi)
 	CMPSD   ABSX, TMP, $0
 	ORPD    TMP, INFMASK      // INFMASK =  INFMASK | IsInf(absxi)
 	UCOMISD SCALE, ABSX
 	JA      adjScale          // IF SCALE > ABSXI { goto adjScale }
 	DIVSD SCALE, ABSX // absxi = scale / absxi
 	MULSD ABSX, ABSX  // absxi *= absxi
 	ADDSD ABSX, SUMSQ // sumsq += absxi
 	INCQ  IDX         // ++i
 	CMPQ  IDX, LEN
 	JNE   loop        // if i < len(x) { continue }
 	JMP   retSum      // if i == len(x) { goto retSum }
 adjScale:  // Scale > Absxi
 	DIVSD  ABSX, SCALE  // tmp = absxi / scale
 	MULSD  SCALE, SUMSQ // sumsq *= tmp
 	MULSD  SCALE, SUMSQ // sumsq *= tmp
 	ADDSD  $1.0, SUMSQ  // sumsq += 1
 	MOVUPS ABSX, SCALE  // scale = absxi
 	INCQ   IDX          // ++i
 	CMPQ   IDX, LEN
 	JNE    loop         // if i < len(x) { continue }
 retSum:  // Calculate return value
 	SQRTSD  SUMSQ, SUMSQ    // sumsq = sqrt(sumsq)
 	MULSD   SCALE, SUMSQ    // sumsq += scale
 	MOVQ    SUMSQ, R10      // tmp = sumsq
 	UCOMISD ZERO, INFMASK
 	CMOVQPS INF_DATA, R10   // if INFMASK { tmp = INF }
 	UCOMISD ZERO, NANMASK
 	CMOVQPS NAN_DATA, R10   // if NANMASK { tmp = NaN }
 	MOVQ    R10, sum+24(FP) // return tmp
 	RET
 retZero:
 	MOVSD ZERO, sum+24(FP) // return 0
 	RET
 retNaN:
 	MOVSD NAN_DATA, TMP   // return NaN
 	MOVSD TMP, sum+24(FP)
 	RET
--- a/internal/asm/f64/l2norm_noasm.go
+++ b/internal/asm/f64/l2norm_noasm.go
@@ -0,0 +1,36 @@
 // Copyright ©2019 The Gonum Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64 noasm appengine safe
 package f64
 import "math"
 // L2NormUnitary returns the L2-norm of x.
 func L2NormUnitary(x []float64) (sum float64) {
 	var scale float64
 	sumSquares := 1.0
 	for _, v := range x {
 		if v == 0 {
 			continue
 		}
 		absxi := math.Abs(v)
 		if math.IsNaN(absxi) {
 			return math.NaN()
 		}
 		if scale < absxi {
 			s := scale / absxi
 			sumSquares = 1 + sumSquares*s*s
 			scale = absxi
 		} else {
 			s := absxi / scale
 			sumSquares += s * s
 		}
 	}
 	if math.IsInf(scale, 1) {
 		return math.Inf(1)
 	}
 	return scale * math.Sqrt(sumSquares)
 }
--- a/internal/asm/f64/l2norm_test.go
+++ b/internal/asm/f64/l2norm_test.go
@@ -4,7 +4,20 @@
 package f64
-import "testing"
+import (
 	"fmt"
 	"math"
 	"testing"
 )
 // nanwith copied from floats package
 func nanwith(payload uint64) float64 {
 	const (
 		nanBits = 0x7ff8000000000000
 		nanMask = 0xfff8000000000000
 	)
 	return math.Float64frombits(nanBits | (payload &^ nanMask))
 }
 func TestL2NormUnitary(t *testing.T) {
 	var src_gd float64 = 1
@@ -17,6 +30,7 @@ func TestL2NormUnitary(t *testing.T) {
 		{want: 3.7416573867739413, x: []float64{1, 2, 3}},
 		{want: 3.7416573867739413, x: []float64{-1, -2, -3}},
 		{want: nan, x: []float64{nan}},
 		{want: nan, x: []float64{1, inf, 3, nanwith(25), 5}},
 		{want: 17.88854381999832, x: []float64{8, -8, 8, -8, 8}},
 		{want: 2.23606797749979, x: []float64{0, 1, 0, -1, 0, 1, 0, -1, 0, 1}},
 	} {
@@ -87,3 +101,52 @@ func TestL2DistanceUnitary(t *testing.T) {
 		}
 	}
 }
 func BenchmarkL2NormNetlib(b *testing.B) {
 	netlib := func(x []float64) (sum float64) {
 		var scale float64
 		sumSquares := 1.0
 		for _, v := range x {
 			if v == 0 {
 				continue
 			}
 			absxi := math.Abs(v)
 			if math.IsNaN(absxi) {
 				return math.NaN()
 			}
 			if scale < absxi {
 				s := scale / absxi
 				sumSquares = 1 + sumSquares*s*s
 				scale = absxi
 			} else {
 				s := absxi / scale
 				sumSquares += s * s
 			}
 		}
 		if math.IsInf(scale, 1) {
 			return math.Inf(1)
 		}
 		return scale * math.Sqrt(sumSquares)
 	}
 	tests := []struct {
 		name string
 		f    func(x []float64) float64
 	}{
 		{"L2NormUnitaryNetlib", netlib},
 		{"L2NormUnitary", L2NormUnitary},
 	}
 	x[0] = randomSlice(1, 1)[0] // replace the leading zero (edge case)
 	for _, test := range tests {
 		for _, ln := range []uintptr{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5} {
 			b.Run(fmt.Sprintf("%s-%d", test.name, ln), func(b *testing.B) {
 				b.SetBytes(int64(64 * ln))
 				x := x[:ln]
 				b.ResetTimer()
 				for i := 0; i < b.N; i++ {
 					test.f(x)
 				}
 			})
 		}
 	}
 }
--- a/internal/asm/f64/stubs_amd64.go
+++ b/internal/asm/f64/stubs_amd64.go
@@ -170,3 +170,29 @@ func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, inc
 //      sum += x[i]
 //  }
 func Sum(x []float64) float64
 // L2NormUnitary returns the L2-norm of x.
 //   var scale float64
 //   sumSquares := 1.0
 //   for _, v := range x {
 //   	if v == 0 {
 //   		continue
 //   	}
 //   	absxi := math.Abs(v)
 //   	if math.IsNaN(absxi) {
 //   		return math.NaN()
 //   	}
 //   	if scale < absxi {
 //   		s := scale / absxi
 //   		sumSquares = 1 + sumSquares*s*s
 //   		scale = absxi
 //   	} else {
 //   		s := absxi / scale
 //   		sumSquares += s * s
 //   	}
 // 	  	if math.IsInf(scale, 1) {
 // 		  	return math.Inf(1)
 // 	  	}
 //   }
 //   return scale * math.Sqrt(sumSquares)
 func L2NormUnitary(x []float64) (sum float64)