From 1c707a77a22f99c3ed5cb4dd244ca0823c0c387b Mon Sep 17 00:00:00 2001 From: Chad Kunde Date: Thu, 27 Apr 2017 21:30:40 -0700 Subject: [PATCH 1/5] asm/f64: Added alignment offset and updated randSlice with source. --- asm/f64/asm_test.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/asm/f64/asm_test.go b/asm/f64/asm_test.go index 76b8e37d..cca86e84 100644 --- a/asm/f64/asm_test.go +++ b/asm/f64/asm_test.go @@ -140,6 +140,7 @@ func same(a, b float64) bool { } var ( // Offset sets for testing alignment handling in Unitary assembly functions. + align1 = []int{0, 1} align2 = newIncSet(0, 1) align3 = newIncToSet(0, 1) ) @@ -190,3 +191,14 @@ func randomSlice(n, inc int) []float64 { } return x } + +func randSlice(n, inc int, r *rand.Rand) []float64 { + if inc < 0 { + inc = -inc + } + x := make([]float64, (n-1)*inc+1) + for i := range x { + x[i] = r.Float64() + } + return x +} From e7164049f8f6271c7501be0cb6db4502eb4f12f3 Mon Sep 17 00:00:00 2001 From: Chad Kunde Date: Thu, 27 Apr 2017 21:54:44 -0700 Subject: [PATCH 2/5] asm/f64: Cleaned up and updated scal* test code. Preparation for wide loops in asm code. --- asm/f64/scal_test.go | 335 +++++++++++-------------------------------- 1 file changed, 87 insertions(+), 248 deletions(-) diff --git a/asm/f64/scal_test.go b/asm/f64/scal_test.go index df3a0afb..d88b7d10 100644 --- a/asm/f64/scal_test.go +++ b/asm/f64/scal_test.go @@ -10,11 +10,16 @@ import ( "testing" ) -var dscalTests = []struct { +var scalTests = []struct { alpha float64 x []float64 want []float64 }{ + { + alpha: 0, + x: []float64{}, + want: []float64{}, + }, { alpha: 0, x: []float64{1}, @@ -60,281 +65,115 @@ var dscalTests = []struct { x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9}, want: []float64{0, 2, -4, 6, 8, -10, 12, -14, 16, 18}, }, + { + alpha: 3, + x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9, 12}, + want: []float64{0, 3, -6, 9, 12, -15, 18, -21, 24, 27, 36}, + }, } -func TestDscalUnitary(t *testing.T) { - for i, test := range dscalTests { - const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v" +func TestScalUnitary(t *testing.T) { + const xGdVal = -0.5 + for i, test := range scalTests { + for _, align := range align1 { + prefix := fmt.Sprintf("Test %v (x:%v)", i, align) + xgLn := 4 + align + xg := guardVector(test.x, xGdVal, xgLn) + x := xg[xgLn : len(xg)-xgLn] - prefix := fmt.Sprintf("test %v (x*=a)", i) - x, xFront, xBack := newGuardedVector(test.x, 1) - ScalUnitary(test.alpha, x) + ScalUnitary(test.alpha, x) - if !allNaN(xFront) || !allNaN(xBack) { - t.Errorf(msgGuard, prefix, "x", xFront, xBack) - } - - if !equalStrided(test.want, x, 1) { - t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x) + for i := range test.want { + if !same(x[i], test.want[i]) { + t.Errorf(msgVal, prefix, i, x[i], test.want[i]) + } + } + if !isValidGuard(xg, xGdVal, xgLn) { + t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) + } } } } -func TestDscalUnitaryTo(t *testing.T) { - for i, test := range dscalTests { - const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v" +func TestScalUnitaryTo(t *testing.T) { + const xGdVal, dstGdVal = -1, 0.5 + rng := rand.New(rand.NewSource(42)) + for i, test := range scalTests { + n := len(test.x) + for _, align := range align2 { + prefix := fmt.Sprintf("Test %v (x:%v dst:%v)", i, align.x, align.y) + xgLn, dgLn := 4+align.x, 4+align.y + xg := guardVector(test.x, xGdVal, xgLn) + dg := guardVector(randSlice(n, 1, rng), dstGdVal, dgLn) + x, dst 
:= xg[xgLn:len(xg)-xgLn], dg[dgLn:len(dg)-dgLn] - // Test dst = alpha * x. - prefix := fmt.Sprintf("test %v (dst=a*x)", i) - x, xFront, xBack := newGuardedVector(test.x, 1) - dst, dstFront, dstBack := newGuardedVector(test.x, 1) - ScalUnitaryTo(dst, test.alpha, x) + ScalUnitaryTo(dst, test.alpha, x) - if !allNaN(xFront) || !allNaN(xBack) { - t.Errorf(msgGuard, prefix, "x", xFront, xBack) - } - if !allNaN(dstFront) || !allNaN(dstBack) { - t.Errorf(msgGuard, prefix, "dst", dstFront, dstBack) - } - if !equalStrided(test.x, x, 1) { - t.Errorf("%v: modified read-only x argument", prefix) - } - - if !equalStrided(test.want, dst, 1) { - t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, dst) - } - - // Test x = alpha * x. - prefix = fmt.Sprintf("test %v (x=a*x)", i) - x, xFront, xBack = newGuardedVector(test.x, 1) - ScalUnitaryTo(x, test.alpha, x) - - if !allNaN(xFront) || !allNaN(xBack) { - t.Errorf(msgGuard, prefix, "x", xFront, xBack) - } - - if !equalStrided(test.want, x, 1) { - t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x) + for i := range test.want { + if !same(dst[i], test.want[i]) { + t.Errorf(msgVal, prefix, i, dst[i], test.want[i]) + } + } + if !isValidGuard(xg, xGdVal, xgLn) { + t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) + } + if !isValidGuard(dg, dstGdVal, dgLn) { + t.Errorf(msgGuard, prefix, "y", dg[:dgLn], dg[len(dg)-dgLn:]) + } + if !equalStrided(test.x, x, 1) { + t.Errorf("%v: modified read-only x argument", prefix) + } } } } -func TestDscalInc(t *testing.T) { - const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v" - - for i, test := range dscalTests { +func TestScalInc(t *testing.T) { + const xGdVal = -0.5 + gdLn := 4 + for i, test := range scalTests { n := len(test.x) for _, incX := range []int{1, 2, 3, 4, 7, 10} { - prefix := fmt.Sprintf("test %v (x*=a), incX = %v", i, incX) - x, xFront, xBack := newGuardedVector(test.x, incX) + prefix := fmt.Sprintf("Test %v (x:%v)", i, incX) + xg := guardIncVector(test.x, xGdVal, incX, gdLn) + x := xg[gdLn : len(xg)-gdLn] + ScalInc(test.alpha, x, uintptr(n), uintptr(incX)) - if !allNaN(xFront) || !allNaN(xBack) { - t.Errorf(msgGuard, prefix, "x", xFront, xBack) - } - if nonStridedWrite(x, incX) { - t.Errorf("%v: modified x argument at non-stride position", prefix) - } - - if !equalStrided(test.want, x, incX) { - t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x) + for i := range test.want { + if !same(x[i*incX], test.want[i]) { + t.Errorf(msgVal, prefix, i, x[i*incX], test.want[i]) + } } + checkValidIncGuard(t, xg, xGdVal, incX, gdLn) } } } -func TestDscalIncTo(t *testing.T) { - const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v" - - for i, test := range dscalTests { +func TestScalIncTo(t *testing.T) { + const xGdVal, dstGdVal = -1, 0.5 + gdLn := 4 + rng := rand.New(rand.NewSource(42)) + for i, test := range scalTests { n := len(test.x) + for _, inc := range newIncSet(1, 2, 3, 4, 7, 10) { + prefix := fmt.Sprintf("test %v (x:%v dst:%v)", i, inc.x, inc.y) + xg := guardIncVector(test.x, xGdVal, inc.x, gdLn) + dg := guardIncVector(randSlice(n, 1, rng), dstGdVal, inc.y, gdLn) + x, dst := xg[gdLn:len(xg)-gdLn], dg[gdLn:len(dg)-gdLn] - for _, incX := range []int{1, 2, 3, 4, 7, 10} { - // Test x = alpha * x. 
- prefix := fmt.Sprintf("test %v (x=a*x), incX = %v", i, incX) - x, xFront, xBack := newGuardedVector(test.x, incX) - ScalIncTo(x, uintptr(incX), test.alpha, x, uintptr(n), uintptr(incX)) + ScalIncTo(dst, uintptr(inc.y), test.alpha, x, uintptr(n), uintptr(inc.x)) - if !allNaN(xFront) || !allNaN(xBack) { - t.Errorf(msgGuard, prefix, "x", xFront, xBack) - } - if nonStridedWrite(x, incX) { - t.Errorf("%v: modified x argument at non-stride position", prefix) - } - if !equalStrided(test.want, x, incX) { - t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x) - } - - for _, incDst := range []int{1, 2, 3, 4, 7, 10} { - // Test dst = alpha * x. - prefix = fmt.Sprintf("test %v (dst=a*x), incX = %v, incDst = %v", i, incX, incDst) - x, xFront, xBack = newGuardedVector(test.x, incX) - dst, dstFront, dstBack := newGuardedVector(test.x, incDst) - ScalIncTo(dst, uintptr(incDst), test.alpha, x, uintptr(n), uintptr(incX)) - - if !allNaN(xFront) || !allNaN(xBack) { - t.Errorf(msgGuard, prefix, "x", xFront, xBack) - } - if !allNaN(dstFront) || !allNaN(dstBack) { - t.Errorf(msgGuard, prefix, "dst", dstFront, dstBack) - } - if nonStridedWrite(x, incX) || !equalStrided(test.x, x, incX) { - t.Errorf("%v: modified read-only x argument", prefix) - } - if nonStridedWrite(dst, incDst) { - t.Errorf("%v: modified dst argument at non-stride position", prefix) - } - - if !equalStrided(test.want, dst, incDst) { - t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, dst) + for i := range test.want { + if !same(dst[i*inc.y], test.want[i]) { + t.Errorf(msgVal, prefix, i, dst[i*inc.y], test.want[i]) } } + checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn) + checkValidIncGuard(t, dg, dstGdVal, inc.y, gdLn) + if !equalStrided(test.x, x, inc.x) { + t.Errorf("%v: modified read-only x argument", prefix) + } + } } } - -func BenchmarkDscalUnitaryN1(b *testing.B) { benchmarkDscalUnitary(b, 1) } -func BenchmarkDscalUnitaryN2(b *testing.B) { benchmarkDscalUnitary(b, 2) } -func BenchmarkDscalUnitaryN3(b *testing.B) { benchmarkDscalUnitary(b, 3) } -func BenchmarkDscalUnitaryN4(b *testing.B) { benchmarkDscalUnitary(b, 4) } -func BenchmarkDscalUnitaryN10(b *testing.B) { benchmarkDscalUnitary(b, 10) } -func BenchmarkDscalUnitaryN100(b *testing.B) { benchmarkDscalUnitary(b, 100) } -func BenchmarkDscalUnitaryN1000(b *testing.B) { benchmarkDscalUnitary(b, 1000) } -func BenchmarkDscalUnitaryN10000(b *testing.B) { benchmarkDscalUnitary(b, 10000) } -func BenchmarkDscalUnitaryN100000(b *testing.B) { benchmarkDscalUnitary(b, 100000) } - -func benchmarkDscalUnitary(b *testing.B, n int) { - x := randomSlice(n, 1) - b.ResetTimer() - for i := 0; i < b.N; i += 2 { - ScalUnitary(2, x) - ScalUnitary(0.5, x) - } - benchSink = x -} - -func BenchmarkDscalUnitaryToN1(b *testing.B) { benchmarkDscalUnitaryTo(b, 1) } -func BenchmarkDscalUnitaryToN2(b *testing.B) { benchmarkDscalUnitaryTo(b, 2) } -func BenchmarkDscalUnitaryToN3(b *testing.B) { benchmarkDscalUnitaryTo(b, 3) } -func BenchmarkDscalUnitaryToN4(b *testing.B) { benchmarkDscalUnitaryTo(b, 4) } -func BenchmarkDscalUnitaryToN10(b *testing.B) { benchmarkDscalUnitaryTo(b, 10) } -func BenchmarkDscalUnitaryToN100(b *testing.B) { benchmarkDscalUnitaryTo(b, 100) } -func BenchmarkDscalUnitaryToN1000(b *testing.B) { benchmarkDscalUnitaryTo(b, 1000) } -func BenchmarkDscalUnitaryToN10000(b *testing.B) { benchmarkDscalUnitaryTo(b, 10000) } -func BenchmarkDscalUnitaryToN100000(b *testing.B) { benchmarkDscalUnitaryTo(b, 100000) } - -func benchmarkDscalUnitaryTo(b *testing.B, n 
int) { - x := randomSlice(n, 1) - dst := randomSlice(n, 1) - a := rand.Float64() - b.ResetTimer() - for i := 0; i < b.N; i++ { - ScalUnitaryTo(dst, a, x) - } - benchSink = dst -} - -func BenchmarkDscalUnitaryToXN1(b *testing.B) { benchmarkDscalUnitaryToX(b, 1) } -func BenchmarkDscalUnitaryToXN2(b *testing.B) { benchmarkDscalUnitaryToX(b, 2) } -func BenchmarkDscalUnitaryToXN3(b *testing.B) { benchmarkDscalUnitaryToX(b, 3) } -func BenchmarkDscalUnitaryToXN4(b *testing.B) { benchmarkDscalUnitaryToX(b, 4) } -func BenchmarkDscalUnitaryToXN10(b *testing.B) { benchmarkDscalUnitaryToX(b, 10) } -func BenchmarkDscalUnitaryToXN100(b *testing.B) { benchmarkDscalUnitaryToX(b, 100) } -func BenchmarkDscalUnitaryToXN1000(b *testing.B) { benchmarkDscalUnitaryToX(b, 1000) } -func BenchmarkDscalUnitaryToXN10000(b *testing.B) { benchmarkDscalUnitaryToX(b, 10000) } -func BenchmarkDscalUnitaryToXN100000(b *testing.B) { benchmarkDscalUnitaryToX(b, 100000) } - -func benchmarkDscalUnitaryToX(b *testing.B, n int) { - x := randomSlice(n, 1) - b.ResetTimer() - for i := 0; i < b.N; i += 2 { - ScalUnitaryTo(x, 2, x) - ScalUnitaryTo(x, 0.5, x) - } - benchSink = x -} - -func BenchmarkDscalIncN1Inc1(b *testing.B) { benchmarkDscalInc(b, 1, 1) } - -func BenchmarkDscalIncN2Inc1(b *testing.B) { benchmarkDscalInc(b, 2, 1) } -func BenchmarkDscalIncN2Inc2(b *testing.B) { benchmarkDscalInc(b, 2, 2) } -func BenchmarkDscalIncN2Inc4(b *testing.B) { benchmarkDscalInc(b, 2, 4) } -func BenchmarkDscalIncN2Inc10(b *testing.B) { benchmarkDscalInc(b, 2, 10) } - -func BenchmarkDscalIncN3Inc1(b *testing.B) { benchmarkDscalInc(b, 3, 1) } -func BenchmarkDscalIncN3Inc2(b *testing.B) { benchmarkDscalInc(b, 3, 2) } -func BenchmarkDscalIncN3Inc4(b *testing.B) { benchmarkDscalInc(b, 3, 4) } -func BenchmarkDscalIncN3Inc10(b *testing.B) { benchmarkDscalInc(b, 3, 10) } - -func BenchmarkDscalIncN4Inc1(b *testing.B) { benchmarkDscalInc(b, 4, 1) } -func BenchmarkDscalIncN4Inc2(b *testing.B) { benchmarkDscalInc(b, 4, 2) } -func BenchmarkDscalIncN4Inc4(b *testing.B) { benchmarkDscalInc(b, 4, 4) } -func BenchmarkDscalIncN4Inc10(b *testing.B) { benchmarkDscalInc(b, 4, 10) } - -func BenchmarkDscalIncN10Inc1(b *testing.B) { benchmarkDscalInc(b, 10, 1) } -func BenchmarkDscalIncN10Inc2(b *testing.B) { benchmarkDscalInc(b, 10, 2) } -func BenchmarkDscalIncN10Inc4(b *testing.B) { benchmarkDscalInc(b, 10, 4) } -func BenchmarkDscalIncN10Inc10(b *testing.B) { benchmarkDscalInc(b, 10, 10) } - -func BenchmarkDscalIncN1000Inc1(b *testing.B) { benchmarkDscalInc(b, 1000, 1) } -func BenchmarkDscalIncN1000Inc2(b *testing.B) { benchmarkDscalInc(b, 1000, 2) } -func BenchmarkDscalIncN1000Inc4(b *testing.B) { benchmarkDscalInc(b, 1000, 4) } -func BenchmarkDscalIncN1000Inc10(b *testing.B) { benchmarkDscalInc(b, 1000, 10) } - -func BenchmarkDscalIncN100000Inc1(b *testing.B) { benchmarkDscalInc(b, 100000, 1) } -func BenchmarkDscalIncN100000Inc2(b *testing.B) { benchmarkDscalInc(b, 100000, 2) } -func BenchmarkDscalIncN100000Inc4(b *testing.B) { benchmarkDscalInc(b, 100000, 4) } -func BenchmarkDscalIncN100000Inc10(b *testing.B) { benchmarkDscalInc(b, 100000, 10) } - -func benchmarkDscalInc(b *testing.B, n, inc int) { - x := randomSlice(n, inc) - b.ResetTimer() - for i := 0; i < b.N; i += 2 { - ScalInc(2, x, uintptr(n), uintptr(inc)) - ScalInc(0.5, x, uintptr(n), uintptr(inc)) - } - benchSink = x -} - -func BenchmarkDscalIncToN1Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1, 1) } - -func BenchmarkDscalIncToN2Inc1(b *testing.B) { benchmarkDscalIncTo(b, 2, 1) } -func 
BenchmarkDscalIncToN2Inc2(b *testing.B) { benchmarkDscalIncTo(b, 2, 2) } -func BenchmarkDscalIncToN2Inc4(b *testing.B) { benchmarkDscalIncTo(b, 2, 4) } -func BenchmarkDscalIncToN2Inc10(b *testing.B) { benchmarkDscalIncTo(b, 2, 10) } - -func BenchmarkDscalIncToN3Inc1(b *testing.B) { benchmarkDscalIncTo(b, 3, 1) } -func BenchmarkDscalIncToN3Inc2(b *testing.B) { benchmarkDscalIncTo(b, 3, 2) } -func BenchmarkDscalIncToN3Inc4(b *testing.B) { benchmarkDscalIncTo(b, 3, 4) } -func BenchmarkDscalIncToN3Inc10(b *testing.B) { benchmarkDscalIncTo(b, 3, 10) } - -func BenchmarkDscalIncToN4Inc1(b *testing.B) { benchmarkDscalIncTo(b, 4, 1) } -func BenchmarkDscalIncToN4Inc2(b *testing.B) { benchmarkDscalIncTo(b, 4, 2) } -func BenchmarkDscalIncToN4Inc4(b *testing.B) { benchmarkDscalIncTo(b, 4, 4) } -func BenchmarkDscalIncToN4Inc10(b *testing.B) { benchmarkDscalIncTo(b, 4, 10) } - -func BenchmarkDscalIncToN10Inc1(b *testing.B) { benchmarkDscalIncTo(b, 10, 1) } -func BenchmarkDscalIncToN10Inc2(b *testing.B) { benchmarkDscalIncTo(b, 10, 2) } -func BenchmarkDscalIncToN10Inc4(b *testing.B) { benchmarkDscalIncTo(b, 10, 4) } -func BenchmarkDscalIncToN10Inc10(b *testing.B) { benchmarkDscalIncTo(b, 10, 10) } - -func BenchmarkDscalIncToN1000Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1000, 1) } -func BenchmarkDscalIncToN1000Inc2(b *testing.B) { benchmarkDscalIncTo(b, 1000, 2) } -func BenchmarkDscalIncToN1000Inc4(b *testing.B) { benchmarkDscalIncTo(b, 1000, 4) } -func BenchmarkDscalIncToN1000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 1000, 10) } - -func BenchmarkDscalIncToN100000Inc1(b *testing.B) { benchmarkDscalIncTo(b, 100000, 1) } -func BenchmarkDscalIncToN100000Inc2(b *testing.B) { benchmarkDscalIncTo(b, 100000, 2) } -func BenchmarkDscalIncToN100000Inc4(b *testing.B) { benchmarkDscalIncTo(b, 100000, 4) } -func BenchmarkDscalIncToN100000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 100000, 10) } - -func benchmarkDscalIncTo(b *testing.B, n, inc int) { - x := randomSlice(n, inc) - dst := randomSlice(n, inc) - a := rand.Float64() - b.ResetTimer() - for i := 0; i < b.N; i++ { - ScalIncTo(dst, uintptr(inc), a, x, uintptr(n), uintptr(inc)) - } - benchSink = dst -} From 58fee5277a9f1137a67903021729e2995ef78b6b Mon Sep 17 00:00:00 2001 From: Chad Kunde Date: Thu, 27 Apr 2017 21:55:36 -0700 Subject: [PATCH 3/5] asm/f64: Updated scal* benchmark code with go1.7 constructs. --- asm/f64/benchScal_test.go | 87 ++++++++++++++++++++++ asm/f64/bench_test.go | 152 +++++++++++++++++++++++++++++++++++++- 2 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 asm/f64/benchScal_test.go diff --git a/asm/f64/benchScal_test.go b/asm/f64/benchScal_test.go new file mode 100644 index 00000000..b4fb19ad --- /dev/null +++ b/asm/f64/benchScal_test.go @@ -0,0 +1,87 @@ +// Copyright ©2017 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build go1.7 + +package f64 + +import ( + "fmt" + "testing" +) + +var uniScal = []int64{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4} + +func BenchmarkScalUnitary(t *testing.B) { + tstName := "ScalUnitary" + for _, ln := range uniScal { + t.Run(fmt.Sprintf("%s-%d", tstName, ln), func(b *testing.B) { + b.SetBytes(64 * ln) + x := x[:ln] + b.ResetTimer() + for i := 0; i < b.N; i++ { + ScalUnitary(a, x) + } + }) + } +} + +func BenchmarkScalUnitaryTo(t *testing.B) { + tstName := "ScalUnitaryTo" + for _, ln := range uniScal { + t.Run(fmt.Sprintf("%s-%d", tstName, ln), func(b *testing.B) { + b.SetBytes(int64(64 * ln)) + x, y := x[:ln], y[:ln] + b.ResetTimer() + for i := 0; i < b.N; i++ { + ScalUnitaryTo(y, a, x) + } + }) + } +} + +var incScal = []struct { + len uintptr + inc []int +}{ + {1, []int{1}}, + {3, []int{1, 2, 4, 10}}, + {10, []int{1, 2, 4, 10}}, + {30, []int{1, 2, 4, 10}}, + {1e2, []int{1, 2, 4, 10}}, + {3e2, []int{1, 2, 4, 10}}, + {1e3, []int{1, 2, 4, 10}}, + {3e3, []int{1, 2, 4, 10}}, + {1e4, []int{1, 2, 4, 10}}, +} + +func BenchmarkScalInc(t *testing.B) { + tstName := "ScalInc" + for _, tt := range incScal { + for _, inc := range tt.inc { + t.Run(fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc), func(b *testing.B) { + b.SetBytes(int64(64 * tt.len)) + tstInc := uintptr(inc) + for i := 0; i < b.N; i++ { + ScalInc(a, x, uintptr(tt.len), tstInc) + } + }) + } + } +} + +func BenchmarkScalIncTo(t *testing.B) { + tstName := "ScalIncTo" + for _, tt := range incScal { + for _, inc := range tt.inc { + t.Run(fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc), func(b *testing.B) { + b.SetBytes(int64(64 * tt.len)) + tstInc := uintptr(inc) + for i := 0; i < b.N; i++ { + ScalIncTo(z, tstInc, a, x, uintptr(tt.len), tstInc) + } + }) + } + } +} diff --git a/asm/f64/bench_test.go b/asm/f64/bench_test.go index 95cf46f4..78d26d77 100644 --- a/asm/f64/bench_test.go +++ b/asm/f64/bench_test.go @@ -6,7 +6,10 @@ package f64 -import "testing" +import ( + "math/rand" + "testing" +) var ( a = float64(2) @@ -286,3 +289,150 @@ func BenchmarkLF64AxpyIncToN100000IncM1(b *testing.B) { benchaxpyincto(b, 10000 func BenchmarkLF64AxpyIncToN100000IncM2(b *testing.B) { benchaxpyincto(b, 100000, -2, naiveaxpyincto) } func BenchmarkLF64AxpyIncToN100000IncM4(b *testing.B) { benchaxpyincto(b, 100000, -4, naiveaxpyincto) } func BenchmarkLF64AxpyIncToN100000IncM10(b *testing.B) { benchaxpyincto(b, 100000, -10, naiveaxpyincto) } + +// Scal* benchmarks +func BenchmarkDscalUnitaryN1(b *testing.B) { benchmarkDscalUnitary(b, 1) } +func BenchmarkDscalUnitaryN2(b *testing.B) { benchmarkDscalUnitary(b, 2) } +func BenchmarkDscalUnitaryN3(b *testing.B) { benchmarkDscalUnitary(b, 3) } +func BenchmarkDscalUnitaryN4(b *testing.B) { benchmarkDscalUnitary(b, 4) } +func BenchmarkDscalUnitaryN10(b *testing.B) { benchmarkDscalUnitary(b, 10) } +func BenchmarkDscalUnitaryN100(b *testing.B) { benchmarkDscalUnitary(b, 100) } +func BenchmarkDscalUnitaryN1000(b *testing.B) { benchmarkDscalUnitary(b, 1000) } +func BenchmarkDscalUnitaryN10000(b *testing.B) { benchmarkDscalUnitary(b, 10000) } +func BenchmarkDscalUnitaryN100000(b *testing.B) { benchmarkDscalUnitary(b, 100000) } + +func benchmarkDscalUnitary(b *testing.B, n int) { + x := randomSlice(n, 1) + b.ResetTimer() + for i := 0; i < b.N; i += 2 { + ScalUnitary(2, x) + ScalUnitary(0.5, x) + } + benchSink = x +} + +func BenchmarkDscalUnitaryToN1(b *testing.B) { benchmarkDscalUnitaryTo(b, 1) } +func BenchmarkDscalUnitaryToN2(b *testing.B) { benchmarkDscalUnitaryTo(b, 2) } +func 
BenchmarkDscalUnitaryToN3(b *testing.B) { benchmarkDscalUnitaryTo(b, 3) } +func BenchmarkDscalUnitaryToN4(b *testing.B) { benchmarkDscalUnitaryTo(b, 4) } +func BenchmarkDscalUnitaryToN10(b *testing.B) { benchmarkDscalUnitaryTo(b, 10) } +func BenchmarkDscalUnitaryToN100(b *testing.B) { benchmarkDscalUnitaryTo(b, 100) } +func BenchmarkDscalUnitaryToN1000(b *testing.B) { benchmarkDscalUnitaryTo(b, 1000) } +func BenchmarkDscalUnitaryToN10000(b *testing.B) { benchmarkDscalUnitaryTo(b, 10000) } +func BenchmarkDscalUnitaryToN100000(b *testing.B) { benchmarkDscalUnitaryTo(b, 100000) } + +func benchmarkDscalUnitaryTo(b *testing.B, n int) { + x := randomSlice(n, 1) + dst := randomSlice(n, 1) + a := rand.Float64() + b.ResetTimer() + for i := 0; i < b.N; i++ { + ScalUnitaryTo(dst, a, x) + } + benchSink = dst +} + +func BenchmarkDscalUnitaryToXN1(b *testing.B) { benchmarkDscalUnitaryToX(b, 1) } +func BenchmarkDscalUnitaryToXN2(b *testing.B) { benchmarkDscalUnitaryToX(b, 2) } +func BenchmarkDscalUnitaryToXN3(b *testing.B) { benchmarkDscalUnitaryToX(b, 3) } +func BenchmarkDscalUnitaryToXN4(b *testing.B) { benchmarkDscalUnitaryToX(b, 4) } +func BenchmarkDscalUnitaryToXN10(b *testing.B) { benchmarkDscalUnitaryToX(b, 10) } +func BenchmarkDscalUnitaryToXN100(b *testing.B) { benchmarkDscalUnitaryToX(b, 100) } +func BenchmarkDscalUnitaryToXN1000(b *testing.B) { benchmarkDscalUnitaryToX(b, 1000) } +func BenchmarkDscalUnitaryToXN10000(b *testing.B) { benchmarkDscalUnitaryToX(b, 10000) } +func BenchmarkDscalUnitaryToXN100000(b *testing.B) { benchmarkDscalUnitaryToX(b, 100000) } + +func benchmarkDscalUnitaryToX(b *testing.B, n int) { + x := randomSlice(n, 1) + b.ResetTimer() + for i := 0; i < b.N; i += 2 { + ScalUnitaryTo(x, 2, x) + ScalUnitaryTo(x, 0.5, x) + } + benchSink = x +} + +func BenchmarkDscalIncN1Inc1(b *testing.B) { benchmarkDscalInc(b, 1, 1) } + +func BenchmarkDscalIncN2Inc1(b *testing.B) { benchmarkDscalInc(b, 2, 1) } +func BenchmarkDscalIncN2Inc2(b *testing.B) { benchmarkDscalInc(b, 2, 2) } +func BenchmarkDscalIncN2Inc4(b *testing.B) { benchmarkDscalInc(b, 2, 4) } +func BenchmarkDscalIncN2Inc10(b *testing.B) { benchmarkDscalInc(b, 2, 10) } + +func BenchmarkDscalIncN3Inc1(b *testing.B) { benchmarkDscalInc(b, 3, 1) } +func BenchmarkDscalIncN3Inc2(b *testing.B) { benchmarkDscalInc(b, 3, 2) } +func BenchmarkDscalIncN3Inc4(b *testing.B) { benchmarkDscalInc(b, 3, 4) } +func BenchmarkDscalIncN3Inc10(b *testing.B) { benchmarkDscalInc(b, 3, 10) } + +func BenchmarkDscalIncN4Inc1(b *testing.B) { benchmarkDscalInc(b, 4, 1) } +func BenchmarkDscalIncN4Inc2(b *testing.B) { benchmarkDscalInc(b, 4, 2) } +func BenchmarkDscalIncN4Inc4(b *testing.B) { benchmarkDscalInc(b, 4, 4) } +func BenchmarkDscalIncN4Inc10(b *testing.B) { benchmarkDscalInc(b, 4, 10) } + +func BenchmarkDscalIncN10Inc1(b *testing.B) { benchmarkDscalInc(b, 10, 1) } +func BenchmarkDscalIncN10Inc2(b *testing.B) { benchmarkDscalInc(b, 10, 2) } +func BenchmarkDscalIncN10Inc4(b *testing.B) { benchmarkDscalInc(b, 10, 4) } +func BenchmarkDscalIncN10Inc10(b *testing.B) { benchmarkDscalInc(b, 10, 10) } + +func BenchmarkDscalIncN1000Inc1(b *testing.B) { benchmarkDscalInc(b, 1000, 1) } +func BenchmarkDscalIncN1000Inc2(b *testing.B) { benchmarkDscalInc(b, 1000, 2) } +func BenchmarkDscalIncN1000Inc4(b *testing.B) { benchmarkDscalInc(b, 1000, 4) } +func BenchmarkDscalIncN1000Inc10(b *testing.B) { benchmarkDscalInc(b, 1000, 10) } + +func BenchmarkDscalIncN100000Inc1(b *testing.B) { benchmarkDscalInc(b, 100000, 1) } +func BenchmarkDscalIncN100000Inc2(b *testing.B) { 
benchmarkDscalInc(b, 100000, 2) } +func BenchmarkDscalIncN100000Inc4(b *testing.B) { benchmarkDscalInc(b, 100000, 4) } +func BenchmarkDscalIncN100000Inc10(b *testing.B) { benchmarkDscalInc(b, 100000, 10) } + +func benchmarkDscalInc(b *testing.B, n, inc int) { + x := randomSlice(n, inc) + b.ResetTimer() + for i := 0; i < b.N; i += 2 { + ScalInc(2, x, uintptr(n), uintptr(inc)) + ScalInc(0.5, x, uintptr(n), uintptr(inc)) + } + benchSink = x +} + +func BenchmarkDscalIncToN1Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1, 1) } + +func BenchmarkDscalIncToN2Inc1(b *testing.B) { benchmarkDscalIncTo(b, 2, 1) } +func BenchmarkDscalIncToN2Inc2(b *testing.B) { benchmarkDscalIncTo(b, 2, 2) } +func BenchmarkDscalIncToN2Inc4(b *testing.B) { benchmarkDscalIncTo(b, 2, 4) } +func BenchmarkDscalIncToN2Inc10(b *testing.B) { benchmarkDscalIncTo(b, 2, 10) } + +func BenchmarkDscalIncToN3Inc1(b *testing.B) { benchmarkDscalIncTo(b, 3, 1) } +func BenchmarkDscalIncToN3Inc2(b *testing.B) { benchmarkDscalIncTo(b, 3, 2) } +func BenchmarkDscalIncToN3Inc4(b *testing.B) { benchmarkDscalIncTo(b, 3, 4) } +func BenchmarkDscalIncToN3Inc10(b *testing.B) { benchmarkDscalIncTo(b, 3, 10) } + +func BenchmarkDscalIncToN4Inc1(b *testing.B) { benchmarkDscalIncTo(b, 4, 1) } +func BenchmarkDscalIncToN4Inc2(b *testing.B) { benchmarkDscalIncTo(b, 4, 2) } +func BenchmarkDscalIncToN4Inc4(b *testing.B) { benchmarkDscalIncTo(b, 4, 4) } +func BenchmarkDscalIncToN4Inc10(b *testing.B) { benchmarkDscalIncTo(b, 4, 10) } + +func BenchmarkDscalIncToN10Inc1(b *testing.B) { benchmarkDscalIncTo(b, 10, 1) } +func BenchmarkDscalIncToN10Inc2(b *testing.B) { benchmarkDscalIncTo(b, 10, 2) } +func BenchmarkDscalIncToN10Inc4(b *testing.B) { benchmarkDscalIncTo(b, 10, 4) } +func BenchmarkDscalIncToN10Inc10(b *testing.B) { benchmarkDscalIncTo(b, 10, 10) } + +func BenchmarkDscalIncToN1000Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1000, 1) } +func BenchmarkDscalIncToN1000Inc2(b *testing.B) { benchmarkDscalIncTo(b, 1000, 2) } +func BenchmarkDscalIncToN1000Inc4(b *testing.B) { benchmarkDscalIncTo(b, 1000, 4) } +func BenchmarkDscalIncToN1000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 1000, 10) } + +func BenchmarkDscalIncToN100000Inc1(b *testing.B) { benchmarkDscalIncTo(b, 100000, 1) } +func BenchmarkDscalIncToN100000Inc2(b *testing.B) { benchmarkDscalIncTo(b, 100000, 2) } +func BenchmarkDscalIncToN100000Inc4(b *testing.B) { benchmarkDscalIncTo(b, 100000, 4) } +func BenchmarkDscalIncToN100000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 100000, 10) } + +func benchmarkDscalIncTo(b *testing.B, n, inc int) { + x := randomSlice(n, inc) + dst := randomSlice(n, inc) + a := rand.Float64() + b.ResetTimer() + for i := 0; i < b.N; i++ { + ScalIncTo(dst, uintptr(inc), a, x, uintptr(n), uintptr(inc)) + } + benchSink = dst +} From 7b2995e7fea74e95b285ecf845c69572e610f6a2 Mon Sep 17 00:00:00 2001 From: Chad Kunde Date: Thu, 27 Apr 2017 22:15:10 -0700 Subject: [PATCH 4/5] asm/f64: Defined macros for named variable registers in scal* code. 
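The register renames in this patch are purely mechanical: raw register names (R8, DX, X7, ...) are replaced by #define'd aliases such as X_PTR, LEN and ALPHA, with no change to instruction selection or ordering. As a reference while reading this diff and the next one, the scal* kernels compute, in plain Go terms, roughly the following (an illustrative sketch only; the package name f64ref is hypothetical, but the signatures match the exported assembly stubs shown in the diffs):

	package f64ref

	// ScalUnitary: x[i] *= alpha for all i.
	func ScalUnitary(alpha float64, x []float64) {
		for i := range x {
			x[i] *= alpha
		}
	}

	// ScalUnitaryTo: dst[i] = alpha * x[i]; assumes len(dst) >= len(x).
	func ScalUnitaryTo(dst []float64, alpha float64, x []float64) {
		for i, v := range x {
			dst[i] = alpha * v
		}
	}

	// ScalInc: scale n strided elements of x in place.
	func ScalInc(alpha float64, x []float64, n, incX uintptr) {
		for i, ix := uintptr(0), uintptr(0); i < n; i, ix = i+1, ix+incX {
			x[ix] *= alpha
		}
	}

	// ScalIncTo: dst[idst] = alpha * x[ix] over n strided elements.
	func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) {
		for i, ix, id := uintptr(0), uintptr(0), uintptr(0); i < n; i, ix, id = i+1, ix+incX, id+incDst {
			dst[id] = alpha * x[ix]
		}
	}
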
--- asm/f64/scalinc_amd64.s | 57 ++++++++++++++++------------ asm/f64/scalincto_amd64.s | 71 ++++++++++++++++++++--------------- asm/f64/scalunitary_amd64.s | 58 ++++++++++++++++------------ asm/f64/scalunitaryto_amd64.s | 58 ++++++++++++++++------------ 4 files changed, 140 insertions(+), 104 deletions(-) diff --git a/asm/f64/scalinc_amd64.s b/asm/f64/scalinc_amd64.s index 01767d4c..386fbcd5 100644 --- a/asm/f64/scalinc_amd64.s +++ b/asm/f64/scalinc_amd64.s @@ -38,43 +38,52 @@ #include "textflag.h" -// func DscalInc(alpha float64, x []float64, n, incX uintptr) +#define X_PTR R8 +#define DST_PTR R9 +#define LEN DX +#define TAIL BX +#define INC_X R10 +#define INCx3_X R11 +#define ALPHA X7 +#define ALPHA_2 X1 + +// func ScalInc(alpha float64, x []float64, n, incX uintptr) TEXT ·ScalInc(SB), NOSPLIT, $0 - MOVHPD alpha+0(FP), X7 - MOVLPD alpha+0(FP), X7 - MOVQ x+8(FP), R8 - MOVQ n+32(FP), DX - MOVQ incX+40(FP), R10 + MOVHPD alpha+0(FP), ALPHA + MOVLPD alpha+0(FP), ALPHA + MOVQ x+8(FP), X_PTR + MOVQ n+32(FP), LEN + MOVQ incX+40(FP), INC_X MOVQ $0, SI - MOVQ R10, AX // nextX = incX - SHLQ $1, R10 // incX *= 2 + MOVQ INC_X, AX // nextX = incX + SHLQ $1, INC_X // incX *= 2 - SUBQ $2, DX // n -= 2 - JL tail // if n < 0 + SUBQ $2, LEN // n -= 2 + JL tail // if n < 0 loop: // x[i] *= alpha unrolled 2x. - MOVHPD 0(R8)(SI*8), X0 - MOVLPD 0(R8)(AX*8), X0 - MULPD X7, X0 - MOVHPD X0, 0(R8)(SI*8) - MOVLPD X0, 0(R8)(AX*8) + MOVHPD 0(X_PTR)(SI*8), X0 + MOVLPD 0(X_PTR)(AX*8), X0 + MULPD ALPHA, X0 + MOVHPD X0, 0(X_PTR)(SI*8) + MOVLPD X0, 0(X_PTR)(AX*8) - ADDQ R10, SI // ix += incX - ADDQ R10, AX // nextX += incX + ADDQ INC_X, SI // ix += incX + ADDQ INC_X, AX // nextX += incX - SUBQ $2, DX // n -= 2 - JGE loop // if n >= 0 goto loop + SUBQ $2, LEN // n -= 2 + JGE loop // if n >= 0 goto loop tail: - ADDQ $2, DX // n += 2 - JLE end // if n <= 0 + ADDQ $2, LEN // n += 2 + JLE end // if n <= 0 // x[i] *= alpha for the last iteration if n is odd. - MOVSD 0(R8)(SI*8), X0 - MULSD X7, X0 - MOVSD X0, 0(R8)(SI*8) + MOVSD 0(X_PTR)(SI*8), X0 + MULSD ALPHA, X0 + MOVSD X0, 0(X_PTR)(SI*8) end: RET diff --git a/asm/f64/scalincto_amd64.s b/asm/f64/scalincto_amd64.s index 25a65662..bb836acc 100644 --- a/asm/f64/scalincto_amd64.s +++ b/asm/f64/scalincto_amd64.s @@ -38,50 +38,61 @@ #include "textflag.h" -// func DscalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) +#define X_PTR R8 +#define DST_PTR R9 +#define LEN DX +#define TAIL BX +#define INC_X R10 +#define INCx3_X R11 +#define INC_DST R11 +#define INCx3_DST R11 +#define ALPHA X7 +#define ALPHA_2 X1 + +// func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) TEXT ·ScalIncTo(SB), NOSPLIT, $0 - MOVQ dst+0(FP), R9 - MOVQ incDst+24(FP), R11 - MOVHPD alpha+32(FP), X7 - MOVLPD alpha+32(FP), X7 - MOVQ x+40(FP), R8 - MOVQ n+64(FP), DX - MOVQ incX+72(FP), R10 + MOVQ dst+0(FP), DST_PTR + MOVQ incDst+24(FP), INC_DST + MOVHPD alpha+32(FP), ALPHA + MOVLPD alpha+32(FP), ALPHA + MOVQ x+40(FP), X_PTR + MOVQ n+64(FP), LEN + MOVQ incX+72(FP), INC_X MOVQ $0, SI MOVQ $0, DI - MOVQ R10, AX // nextX = incX - MOVQ R11, BX // nextDst = incDst - SHLQ $1, R10 // incX *= 2 - SHLQ $1, R11 // incDst *= 2 + MOVQ INC_X, AX // nextX = incX + MOVQ INC_DST, BX // nextDst = incDst + SHLQ $1, INC_X // incX *= 2 + SHLQ $1, INC_DST // incDst *= 2 - SUBQ $2, DX // n -= 2 - JL tail // if n < 0 + SUBQ $2, LEN // n -= 2 + JL tail // if n < 0 loop: // dst[i] = alpha * x[i] unrolled 2x. 
- MOVHPD 0(R8)(SI*8), X0 - MOVLPD 0(R8)(AX*8), X0 - MULPD X7, X0 - MOVHPD X0, 0(R9)(DI*8) - MOVLPD X0, 0(R9)(BX*8) + MOVHPD 0(X_PTR)(SI*8), X0 + MOVLPD 0(X_PTR)(AX*8), X0 + MULPD ALPHA, X0 + MOVHPD X0, 0(DST_PTR)(DI*8) + MOVLPD X0, 0(DST_PTR)(BX*8) - ADDQ R10, SI // ix += incX - ADDQ R10, AX // nextX += incX - ADDQ R11, DI // idst += incDst - ADDQ R11, BX // nextDst += incDst + ADDQ INC_X, SI // ix += incX + ADDQ INC_X, AX // nextX += incX + ADDQ INC_DST, DI // idst += incDst + ADDQ INC_DST, BX // nextDst += incDst - SUBQ $2, DX // n -= 2 - JGE loop // if n >= 0 goto loop + SUBQ $2, LEN // n -= 2 + JGE loop // if n >= 0 goto loop tail: - ADDQ $2, DX // n += 2 - JLE end // if n <= 0 + ADDQ $2, LEN // n += 2 + JLE end // if n <= 0 // dst[i] = alpha * x[i] for the last iteration if n is odd. - MOVSD 0(R8)(SI*8), X0 - MULSD X7, X0 - MOVSD X0, 0(R9)(DI*8) + MOVSD 0(X_PTR)(SI*8), X0 + MULSD ALPHA, X0 + MOVSD X0, 0(DST_PTR)(DI*8) end: RET diff --git a/asm/f64/scalunitary_amd64.s b/asm/f64/scalunitary_amd64.s index cfb0757b..ff7669b5 100644 --- a/asm/f64/scalunitary_amd64.s +++ b/asm/f64/scalunitary_amd64.s @@ -38,42 +38,50 @@ #include "textflag.h" -// func DscalUnitary(alpha float64, x []float64) -TEXT ·ScalUnitary(SB), NOSPLIT, $0 - MOVHPD alpha+0(FP), X7 - MOVLPD alpha+0(FP), X7 - MOVQ x+8(FP), R8 - MOVQ x_len+16(FP), DI // n = len(x) +#define X_PTR R8 +#define DST_PTR DX +#define IDX SI +#define LEN DI +#define TAIL BX +#define ALPHA X7 +#define ALPHA_2 X1 - MOVQ $0, SI // i = 0 - SUBQ $4, DI // n -= 4 - JL tail // if n < 0 goto tail +// func ScalUnitary(alpha float64, x []float64) +TEXT ·ScalUnitary(SB), NOSPLIT, $0 + MOVHPD alpha+0(FP), ALPHA + MOVLPD alpha+0(FP), ALPHA + MOVQ x+8(FP), X_PTR + MOVQ x_len+16(FP), LEN // n = len(x) + + MOVQ $0, IDX // i = 0 + SUBQ $4, LEN // n -= 4 + JL tail // if n < 0 goto tail loop: // x[i] *= alpha unrolled 4x. - MOVUPD 0(R8)(SI*8), X0 - MOVUPD 16(R8)(SI*8), X1 - MULPD X7, X0 - MULPD X7, X1 - MOVUPD X0, 0(R8)(SI*8) - MOVUPD X1, 16(R8)(SI*8) + MOVUPD 0(X_PTR)(IDX*8), X0 + MOVUPD 16(X_PTR)(IDX*8), X1 + MULPD ALPHA, X0 + MULPD ALPHA, X1 + MOVUPD X0, 0(X_PTR)(IDX*8) + MOVUPD X1, 16(X_PTR)(IDX*8) - ADDQ $4, SI // i += 4 - SUBQ $4, DI // n -= 4 - JGE loop // if n >= 0 goto loop + ADDQ $4, IDX // i += 4 + SUBQ $4, LEN // n -= 4 + JGE loop // if n >= 0 goto loop tail: - ADDQ $4, DI // n += 4 - JZ end // if n == 0 goto end + ADDQ $4, LEN // n += 4 + JZ end // if n == 0 goto end onemore: // x[i] *= alpha for the remaining 1-3 elements. - MOVSD 0(R8)(SI*8), X0 - MULSD X7, X0 - MOVSD X0, 0(R8)(SI*8) + MOVSD 0(X_PTR)(IDX*8), X0 + MULSD ALPHA, X0 + MOVSD X0, 0(X_PTR)(IDX*8) - ADDQ $1, SI // i++ - SUBQ $1, DI // n-- + ADDQ $1, IDX // i++ + SUBQ $1, LEN // n-- JNZ onemore // if n != 0 goto onemore end: diff --git a/asm/f64/scalunitaryto_amd64.s b/asm/f64/scalunitaryto_amd64.s index 773e7f7d..3551984d 100644 --- a/asm/f64/scalunitaryto_amd64.s +++ b/asm/f64/scalunitaryto_amd64.s @@ -38,44 +38,52 @@ #include "textflag.h" -// func DscalUnitaryTo(dst []float64, alpha float64, x []float64) +#define X_PTR R8 +#define DST_PTR R9 +#define IDX SI +#define LEN DI +#define TAIL BX +#define ALPHA X7 +#define ALPHA_2 X1 + +// func ScalUnitaryTo(dst []float64, alpha float64, x []float64) // This function assumes len(dst) >= len(x). 
TEXT ·ScalUnitaryTo(SB), NOSPLIT, $0 - MOVQ dst+0(FP), R9 - MOVHPD alpha+24(FP), X7 - MOVLPD alpha+24(FP), X7 - MOVQ x+32(FP), R8 - MOVQ x_len+40(FP), DI // n = len(x) + MOVQ dst+0(FP), DST_PTR + MOVHPD alpha+24(FP), ALPHA + MOVLPD alpha+24(FP), ALPHA + MOVQ x+32(FP), X_PTR + MOVQ x_len+40(FP), LEN // n = len(x) - MOVQ $0, SI // i = 0 - SUBQ $4, DI // n -= 4 - JL tail // if n < 0 goto tail + MOVQ $0, IDX // i = 0 + SUBQ $4, LEN // n -= 4 + JL tail // if n < 0 goto tail loop: // dst[i] = alpha * x[i] unrolled 4x. - MOVUPD 0(R8)(SI*8), X0 - MOVUPD 16(R8)(SI*8), X1 - MULPD X7, X0 - MULPD X7, X1 - MOVUPD X0, 0(R9)(SI*8) - MOVUPD X1, 16(R9)(SI*8) + MOVUPD 0(X_PTR)(IDX*8), X0 + MOVUPD 16(X_PTR)(IDX*8), X1 + MULPD ALPHA, X0 + MULPD ALPHA, X1 + MOVUPD X0, 0(DST_PTR)(IDX*8) + MOVUPD X1, 16(DST_PTR)(IDX*8) - ADDQ $4, SI // i += 4 - SUBQ $4, DI // n -= 4 - JGE loop // if n >= 0 goto loop + ADDQ $4, IDX // i += 4 + SUBQ $4, LEN // n -= 4 + JGE loop // if n >= 0 goto loop tail: - ADDQ $4, DI // n += 4 - JZ end // if n == 0 goto end + ADDQ $4, LEN // n += 4 + JZ end // if n == 0 goto end onemore: // dst[i] = alpha * x[i] for the remaining 1-3 elements. - MOVSD 0(R8)(SI*8), X0 - MULSD X7, X0 - MOVSD X0, 0(R9)(SI*8) + MOVSD 0(X_PTR)(IDX*8), X0 + MULSD ALPHA, X0 + MOVSD X0, 0(DST_PTR)(IDX*8) - ADDQ $1, SI // i++ - SUBQ $1, DI // n-- + ADDQ $1, IDX // i++ + SUBQ $1, LEN // n-- JNZ onemore // if n != 0 goto onemore end: From 1f046fc708e28156282c3cf2d251c44e2f47e226 Mon Sep 17 00:00:00 2001 From: Chad Kunde Date: Fri, 28 Apr 2017 00:14:29 -0700 Subject: [PATCH 5/5] asm/f64: Cleand up scal* assembly code. Used 3xIncrement registers to consolidate pointer arithmetic to the end of the main loop. Moved pipelining register calculations to after short-vector test. Added two-wide tail code block. --- asm/f64/scalinc_amd64.s | 92 ++++++++++++++++++----------- asm/f64/scalincto_amd64.s | 108 +++++++++++++++++++++------------- asm/f64/scalunitary_amd64.s | 92 ++++++++++++++++++----------- asm/f64/scalunitaryto_amd64.s | 93 ++++++++++++++++++----------- 4 files changed, 240 insertions(+), 145 deletions(-) diff --git a/asm/f64/scalinc_amd64.s b/asm/f64/scalinc_amd64.s index 386fbcd5..55a5758e 100644 --- a/asm/f64/scalinc_amd64.s +++ b/asm/f64/scalinc_amd64.s @@ -38,52 +38,76 @@ #include "textflag.h" -#define X_PTR R8 -#define DST_PTR R9 -#define LEN DX +#define X_PTR SI +#define LEN CX #define TAIL BX -#define INC_X R10 -#define INCx3_X R11 -#define ALPHA X7 +#define INC_X R8 +#define INCx3_X R9 +#define ALPHA X0 #define ALPHA_2 X1 // func ScalInc(alpha float64, x []float64, n, incX uintptr) TEXT ·ScalInc(SB), NOSPLIT, $0 - MOVHPD alpha+0(FP), ALPHA - MOVLPD alpha+0(FP), ALPHA - MOVQ x+8(FP), X_PTR - MOVQ n+32(FP), LEN - MOVQ incX+40(FP), INC_X + MOVSD alpha+0(FP), ALPHA // ALPHA = alpha + MOVQ x_base+8(FP), X_PTR // X_PTR = &x + MOVQ incX+40(FP), INC_X // INC_X = incX + SHLQ $3, INC_X // INC_X *= sizeof(float64) + MOVQ n+32(FP), LEN // LEN = n + CMPQ LEN, $0 + JE end // if LEN == 0 { return } - MOVQ $0, SI - MOVQ INC_X, AX // nextX = incX - SHLQ $1, INC_X // incX *= 2 + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } - SUBQ $2, LEN // n -= 2 - JL tail // if n < 0 + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 -loop: - // x[i] *= alpha unrolled 2x. 
- MOVHPD 0(X_PTR)(SI*8), X0 - MOVLPD 0(X_PTR)(AX*8), X0 - MULPD ALPHA, X0 - MOVHPD X0, 0(X_PTR)(SI*8) - MOVLPD X0, 0(X_PTR)(AX*8) +loop: // do { // x[i] *= alpha unrolled 4x. + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 - ADDQ INC_X, SI // ix += incX - ADDQ INC_X, AX // nextX += incX + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 - SUBQ $2, LEN // n -= 2 - JGE loop // if n >= 0 goto loop + MOVSD X2, (X_PTR) // x[i] = X_i + MOVSD X3, (X_PTR)(INC_X*1) + MOVSD X4, (X_PTR)(INC_X*2) + MOVSD X5, (X_PTR)(INCx3_X*1) -tail: - ADDQ $2, LEN // n += 2 - JLE end // if n <= 0 + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } - // x[i] *= alpha for the last iteration if n is odd. - MOVSD 0(X_PTR)(SI*8), X0 - MULSD ALPHA, X0 - MOVSD X0, 0(X_PTR)(SI*8) +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: // do { + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + MOVSD X2, (X_PTR) // x[i] = X_i + MOVSD X3, (X_PTR)(INC_X*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + + ANDQ $1, TAIL + JZ end + +tail_one: + MOVSD (X_PTR), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (X_PTR) // x[i] = X_i end: RET diff --git a/asm/f64/scalincto_amd64.s b/asm/f64/scalincto_amd64.s index bb836acc..57c90a2a 100644 --- a/asm/f64/scalincto_amd64.s +++ b/asm/f64/scalincto_amd64.s @@ -38,61 +38,85 @@ #include "textflag.h" -#define X_PTR R8 -#define DST_PTR R9 -#define LEN DX +#define X_PTR SI +#define DST_PTR DI +#define LEN CX #define TAIL BX -#define INC_X R10 -#define INCx3_X R11 -#define INC_DST R11 +#define INC_X R8 +#define INCx3_X R9 +#define INC_DST R10 #define INCx3_DST R11 -#define ALPHA X7 +#define ALPHA X0 #define ALPHA_2 X1 // func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) TEXT ·ScalIncTo(SB), NOSPLIT, $0 - MOVQ dst+0(FP), DST_PTR - MOVQ incDst+24(FP), INC_DST - MOVHPD alpha+32(FP), ALPHA - MOVLPD alpha+32(FP), ALPHA - MOVQ x+40(FP), X_PTR - MOVQ n+64(FP), LEN - MOVQ incX+72(FP), INC_X + MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst + MOVQ incDst+24(FP), INC_DST // INC_DST = incDst + SHLQ $3, INC_DST // INC_DST *= sizeof(float64) + MOVSD alpha+32(FP), ALPHA // ALPHA = alpha + MOVQ x_base+40(FP), X_PTR // X_PTR = &x + MOVQ n+64(FP), LEN // LEN = n + MOVQ incX+72(FP), INC_X // INC_X = incX + SHLQ $3, INC_X // INC_X *= sizeof(float64) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } - MOVQ $0, SI - MOVQ $0, DI - MOVQ INC_X, AX // nextX = incX - MOVQ INC_DST, BX // nextDst = incDst - SHLQ $1, INC_X // incX *= 2 - SHLQ $1, INC_DST // incDst *= 2 + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } - SUBQ $2, LEN // n -= 2 - JL tail // if n < 0 + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 -loop: - // dst[i] = alpha * x[i] unrolled 2x. - MOVHPD 0(X_PTR)(SI*8), X0 - MOVLPD 0(X_PTR)(AX*8), X0 - MULPD ALPHA, X0 - MOVHPD X0, 0(DST_PTR)(DI*8) - MOVLPD X0, 0(DST_PTR)(BX*8) +loop: // do { // x[i] *= alpha unrolled 4x. 
+ MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 - ADDQ INC_X, SI // ix += incX - ADDQ INC_X, AX // nextX += incX - ADDQ INC_DST, DI // idst += incDst - ADDQ INC_DST, BX // nextDst += incDst + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 - SUBQ $2, LEN // n -= 2 - JGE loop // if n >= 0 goto loop + MOVSD X2, (DST_PTR) // dst[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + MOVSD X4, (DST_PTR)(INC_DST*2) + MOVSD X5, (DST_PTR)(INCx3_DST*1) -tail: - ADDQ $2, LEN // n += 2 - JLE end // if n <= 0 + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } - // dst[i] = alpha * x[i] for the last iteration if n is odd. - MOVSD 0(X_PTR)(SI*8), X0 - MULSD ALPHA, X0 - MOVSD X0, 0(DST_PTR)(DI*8) +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + MOVSD X2, (DST_PTR) // dst[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2]) + + ANDQ $1, TAIL + JZ end + +tail_one: + MOVSD (X_PTR), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (DST_PTR) // x[i] = X_i end: RET diff --git a/asm/f64/scalunitary_amd64.s b/asm/f64/scalunitary_amd64.s index ff7669b5..da23af77 100644 --- a/asm/f64/scalunitary_amd64.s +++ b/asm/f64/scalunitary_amd64.s @@ -38,51 +38,75 @@ #include "textflag.h" -#define X_PTR R8 -#define DST_PTR DX -#define IDX SI -#define LEN DI +#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // @ MOVDDUP XMM0, 8[RSP] + +#define X_PTR SI +#define DST_PTR DI +#define IDX AX +#define LEN CX #define TAIL BX -#define ALPHA X7 +#define ALPHA X0 #define ALPHA_2 X1 // func ScalUnitary(alpha float64, x []float64) TEXT ·ScalUnitary(SB), NOSPLIT, $0 - MOVHPD alpha+0(FP), ALPHA - MOVLPD alpha+0(FP), ALPHA - MOVQ x+8(FP), X_PTR - MOVQ x_len+16(FP), LEN // n = len(x) + MOVDDUP_ALPHA // ALPHA = { alpha, alpha } + MOVQ x_base+8(FP), X_PTR // X_PTR = &x + MOVQ x_len+16(FP), LEN // LEN = len(x) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + XORQ IDX, IDX // IDX = 0 - MOVQ $0, IDX // i = 0 - SUBQ $4, LEN // n -= 4 - JL tail // if n < 0 goto tail + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL = LEN % 8 + SHRQ $3, LEN // LEN = floor( LEN / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } -loop: - // x[i] *= alpha unrolled 4x. - MOVUPD 0(X_PTR)(IDX*8), X0 - MOVUPD 16(X_PTR)(IDX*8), X1 - MULPD ALPHA, X0 - MULPD ALPHA, X1 - MOVUPD X0, 0(X_PTR)(IDX*8) - MOVUPD X1, 16(X_PTR)(IDX*8) + MOVUPS ALPHA, ALPHA_2 - ADDQ $4, IDX // i += 4 - SUBQ $4, LEN // n -= 4 - JGE loop // if n >= 0 goto loop +loop: // do { // x[i] *= alpha unrolled 8x. + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 -tail: - ADDQ $4, LEN // n += 4 - JZ end // if n == 0 goto end + MULPD ALPHA, X2 // X_i *= ALPHA + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 -onemore: - // x[i] *= alpha for the remaining 1-3 elements. 
- MOVSD 0(X_PTR)(IDX*8), X0 - MULSD ALPHA, X0 - MOVSD X0, 0(X_PTR)(IDX*8) + MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i + MOVUPS X3, 16(X_PTR)(IDX*8) + MOVUPS X4, 32(X_PTR)(IDX*8) + MOVUPS X5, 48(X_PTR)(IDX*8) - ADDQ $1, IDX // i++ - SUBQ $1, LEN // n-- - JNZ onemore // if n != 0 goto onemore + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if n == 0 goto end + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MULPD ALPHA, X2 // X_i *= ALPHA + MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { return } + +tail_one: + // x[i] *= alpha for the remaining element. + MOVSD (X_PTR)(IDX*8), X2 + MULSD ALPHA, X2 + MOVSD X2, (X_PTR)(IDX*8) end: RET diff --git a/asm/f64/scalunitaryto_amd64.s b/asm/f64/scalunitaryto_amd64.s index 3551984d..3685d5b5 100644 --- a/asm/f64/scalunitaryto_amd64.s +++ b/asm/f64/scalunitaryto_amd64.s @@ -38,53 +38,76 @@ #include "textflag.h" -#define X_PTR R8 -#define DST_PTR R9 -#define IDX SI -#define LEN DI +#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x2024 // @ MOVDDUP 32(SP), X0 /*XMM0, 32[RSP]*/ + +#define X_PTR SI +#define DST_PTR DI +#define IDX AX +#define LEN CX #define TAIL BX -#define ALPHA X7 +#define ALPHA X0 #define ALPHA_2 X1 // func ScalUnitaryTo(dst []float64, alpha float64, x []float64) // This function assumes len(dst) >= len(x). TEXT ·ScalUnitaryTo(SB), NOSPLIT, $0 - MOVQ dst+0(FP), DST_PTR - MOVHPD alpha+24(FP), ALPHA - MOVLPD alpha+24(FP), ALPHA - MOVQ x+32(FP), X_PTR - MOVQ x_len+40(FP), LEN // n = len(x) + MOVQ x_base+32(FP), X_PTR // X_PTR = &x + MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst + MOVDDUP_ALPHA // ALPHA = { alpha, alpha } + MOVQ x_len+40(FP), LEN // LEN = len(x) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } - MOVQ $0, IDX // i = 0 - SUBQ $4, LEN // n -= 4 - JL tail // if n < 0 goto tail + XORQ IDX, IDX // IDX = 0 + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL = LEN % 8 + SHRQ $3, LEN // LEN = floor( LEN / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } -loop: - // dst[i] = alpha * x[i] unrolled 4x. - MOVUPD 0(X_PTR)(IDX*8), X0 - MOVUPD 16(X_PTR)(IDX*8), X1 - MULPD ALPHA, X0 - MULPD ALPHA, X1 - MOVUPD X0, 0(DST_PTR)(IDX*8) - MOVUPD X1, 16(DST_PTR)(IDX*8) + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining - ADDQ $4, IDX // i += 4 - SUBQ $4, LEN // n -= 4 - JGE loop // if n >= 0 goto loop +loop: // do { // dst[i] = alpha * x[i] unrolled 8x. + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 -tail: - ADDQ $4, LEN // n += 4 - JZ end // if n == 0 goto end + MULPD ALPHA, X2 // X_i *= ALPHA + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 -onemore: - // dst[i] = alpha * x[i] for the remaining 1-3 elements. 
- MOVSD 0(X_PTR)(IDX*8), X0 - MULSD ALPHA, X0 - MOVSD X0, 0(DST_PTR)(IDX*8) + MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i + MOVUPS X3, 16(DST_PTR)(IDX*8) + MOVUPS X4, 32(DST_PTR)(IDX*8) + MOVUPS X5, 48(DST_PTR)(IDX*8) - ADDQ $1, IDX // i++ - SUBQ $1, LEN // n-- - JNZ onemore // if n != 0 goto onemore + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop counters + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if LEN == 0 { goto tail_one } + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MULPD ALPHA, X2 // X_i *= ALPHA + MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { return } + +tail_one: + MOVSD (X_PTR)(IDX*8), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X_i end: RET
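
For reference, the loop structure introduced by this final patch corresponds, in plain Go terms, to roughly the sketch below for ScalUnitary: an 8-element main loop (four 128-bit MOVUPS/MULPD pairs per iteration), a two-wide tail block, and a final single-element step. ScalInc and ScalIncTo follow the same shape with a 4-element main loop, since strided elements are loaded one at a time. The function name is hypothetical and the code is an illustrative sketch of the control flow, not the kernel itself:

	// scalUnitarySketch mirrors the rewritten ScalUnitary kernel:
	// main loop (LEN = n/8 iterations), two-wide tail, one-element tail.
	func scalUnitarySketch(alpha float64, x []float64) {
		n := len(x)
		i := 0
		for ; i <= n-8; i += 8 { // unrolled 8x in the assembly
			for j := 0; j < 8; j++ {
				x[i+j] *= alpha
			}
		}
		for ; i <= n-2; i += 2 { // two-wide tail block, TAIL/2 iterations
			x[i] *= alpha
			x[i+1] *= alpha
		}
		if i < n { // remaining odd element
			x[i] *= alpha
		}
	}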