Merge pull request #53 from Kunde21/f64Scal

asm/f64: Updated scal assembly to wide, pipelined loops.
This commit is contained in:
Chad Kunde
2017-05-19 02:53:56 -07:00
committed by GitHub
8 changed files with 594 additions and 375 deletions

View File

@@ -140,6 +140,7 @@ func same(a, b float64) bool {
}
var ( // Offset sets for testing alignment handling in Unitary assembly functions.
align1 = []int{0, 1}
align2 = newIncSet(0, 1)
align3 = newIncToSet(0, 1)
)
@@ -190,3 +191,14 @@ func randomSlice(n, inc int) []float64 {
}
return x
}
func randSlice(n, inc int, r *rand.Rand) []float64 {
if inc < 0 {
inc = -inc
}
x := make([]float64, (n-1)*inc+1)
for i := range x {
x[i] = r.Float64()
}
return x
}

87
asm/f64/benchScal_test.go Normal file
View File

@@ -0,0 +1,87 @@
// Copyright ©2017 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build go1.7
package f64
import (
"fmt"
"testing"
)
var uniScal = []int64{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4}
// BenchmarkScalUnitary times ScalUnitary across the lengths in uniScal,
// reporting throughput via SetBytes for each sub-benchmark.
func BenchmarkScalUnitary(t *testing.B) {
	const tstName = "ScalUnitary"
	for _, n := range uniScal {
		name := fmt.Sprintf("%s-%d", tstName, n)
		t.Run(name, func(b *testing.B) {
			b.SetBytes(64 * n)
			v := x[:n]
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				ScalUnitary(a, v)
			}
		})
	}
}
// BenchmarkScalUnitaryTo times ScalUnitaryTo across the lengths in uniScal,
// scaling the package vector x into the destination y.
func BenchmarkScalUnitaryTo(t *testing.B) {
	const tstName = "ScalUnitaryTo"
	for _, n := range uniScal {
		name := fmt.Sprintf("%s-%d", tstName, n)
		t.Run(name, func(b *testing.B) {
			b.SetBytes(64 * n)
			src, dst := x[:n], y[:n]
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				ScalUnitaryTo(dst, a, src)
			}
		})
	}
}
// incScal enumerates the (length, stride) combinations exercised by the
// strided Scal benchmarks (BenchmarkScalInc, BenchmarkScalIncTo) below.
var incScal = []struct {
	len uintptr
	inc []int
}{
	{1, []int{1}},
	{3, []int{1, 2, 4, 10}},
	{10, []int{1, 2, 4, 10}},
	{30, []int{1, 2, 4, 10}},
	{1e2, []int{1, 2, 4, 10}},
	{3e2, []int{1, 2, 4, 10}},
	{1e3, []int{1, 2, 4, 10}},
	{3e3, []int{1, 2, 4, 10}},
	{1e4, []int{1, 2, 4, 10}},
}
// BenchmarkScalInc times ScalInc over every (length, stride) pair in incScal.
func BenchmarkScalInc(t *testing.B) {
	const tstName = "ScalInc"
	for _, tt := range incScal {
		for _, inc := range tt.inc {
			name := fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc)
			t.Run(name, func(b *testing.B) {
				b.SetBytes(int64(64 * tt.len))
				incX := uintptr(inc)
				for i := 0; i < b.N; i++ {
					ScalInc(a, x, tt.len, incX)
				}
			})
		}
	}
}
// BenchmarkScalIncTo times ScalIncTo over every (length, stride) pair in
// incScal, using the same stride for source and destination.
func BenchmarkScalIncTo(t *testing.B) {
	const tstName = "ScalIncTo"
	for _, tt := range incScal {
		for _, inc := range tt.inc {
			name := fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc)
			t.Run(name, func(b *testing.B) {
				b.SetBytes(int64(64 * tt.len))
				incX := uintptr(inc)
				for i := 0; i < b.N; i++ {
					ScalIncTo(z, incX, a, x, tt.len, incX)
				}
			})
		}
	}
}

View File

@@ -6,7 +6,10 @@
package f64
import "testing"
import (
"math/rand"
"testing"
)
var (
a = float64(2)
@@ -286,3 +289,150 @@ func BenchmarkLF64AxpyIncToN100000IncM1(b *testing.B) { benchaxpyincto(b, 10000
func BenchmarkLF64AxpyIncToN100000IncM2(b *testing.B) { benchaxpyincto(b, 100000, -2, naiveaxpyincto) }
func BenchmarkLF64AxpyIncToN100000IncM4(b *testing.B) { benchaxpyincto(b, 100000, -4, naiveaxpyincto) }
func BenchmarkLF64AxpyIncToN100000IncM10(b *testing.B) { benchaxpyincto(b, 100000, -10, naiveaxpyincto) }
// Scal* benchmarks
func BenchmarkDscalUnitaryN1(b *testing.B) { benchmarkDscalUnitary(b, 1) }
func BenchmarkDscalUnitaryN2(b *testing.B) { benchmarkDscalUnitary(b, 2) }
func BenchmarkDscalUnitaryN3(b *testing.B) { benchmarkDscalUnitary(b, 3) }
func BenchmarkDscalUnitaryN4(b *testing.B) { benchmarkDscalUnitary(b, 4) }
func BenchmarkDscalUnitaryN10(b *testing.B) { benchmarkDscalUnitary(b, 10) }
func BenchmarkDscalUnitaryN100(b *testing.B) { benchmarkDscalUnitary(b, 100) }
func BenchmarkDscalUnitaryN1000(b *testing.B) { benchmarkDscalUnitary(b, 1000) }
func BenchmarkDscalUnitaryN10000(b *testing.B) { benchmarkDscalUnitary(b, 10000) }
func BenchmarkDscalUnitaryN100000(b *testing.B) { benchmarkDscalUnitary(b, 100000) }
// benchmarkDscalUnitary measures in-place scaling of a length-n unit-stride
// vector. Each pass scales by 2 then by 0.5 so the data stays bounded no
// matter how large b.N grows.
func benchmarkDscalUnitary(b *testing.B, n int) {
	v := randomSlice(n, 1)
	b.ResetTimer()
	for i := 0; i < b.N; i += 2 {
		ScalUnitary(2, v)
		ScalUnitary(0.5, v)
	}
	// Store the result so the compiler cannot eliminate the work.
	benchSink = v
}
func BenchmarkDscalUnitaryToN1(b *testing.B) { benchmarkDscalUnitaryTo(b, 1) }
func BenchmarkDscalUnitaryToN2(b *testing.B) { benchmarkDscalUnitaryTo(b, 2) }
func BenchmarkDscalUnitaryToN3(b *testing.B) { benchmarkDscalUnitaryTo(b, 3) }
func BenchmarkDscalUnitaryToN4(b *testing.B) { benchmarkDscalUnitaryTo(b, 4) }
func BenchmarkDscalUnitaryToN10(b *testing.B) { benchmarkDscalUnitaryTo(b, 10) }
func BenchmarkDscalUnitaryToN100(b *testing.B) { benchmarkDscalUnitaryTo(b, 100) }
func BenchmarkDscalUnitaryToN1000(b *testing.B) { benchmarkDscalUnitaryTo(b, 1000) }
func BenchmarkDscalUnitaryToN10000(b *testing.B) { benchmarkDscalUnitaryTo(b, 10000) }
func BenchmarkDscalUnitaryToN100000(b *testing.B) { benchmarkDscalUnitaryTo(b, 100000) }
// benchmarkDscalUnitaryTo measures out-of-place scaling of a length-n
// unit-stride vector into a distinct destination.
func benchmarkDscalUnitaryTo(b *testing.B, n int) {
	src := randomSlice(n, 1)
	dst := randomSlice(n, 1)
	alpha := rand.Float64()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		ScalUnitaryTo(dst, alpha, src)
	}
	// Store the result so the compiler cannot eliminate the work.
	benchSink = dst
}
func BenchmarkDscalUnitaryToXN1(b *testing.B) { benchmarkDscalUnitaryToX(b, 1) }
func BenchmarkDscalUnitaryToXN2(b *testing.B) { benchmarkDscalUnitaryToX(b, 2) }
func BenchmarkDscalUnitaryToXN3(b *testing.B) { benchmarkDscalUnitaryToX(b, 3) }
func BenchmarkDscalUnitaryToXN4(b *testing.B) { benchmarkDscalUnitaryToX(b, 4) }
func BenchmarkDscalUnitaryToXN10(b *testing.B) { benchmarkDscalUnitaryToX(b, 10) }
func BenchmarkDscalUnitaryToXN100(b *testing.B) { benchmarkDscalUnitaryToX(b, 100) }
func BenchmarkDscalUnitaryToXN1000(b *testing.B) { benchmarkDscalUnitaryToX(b, 1000) }
func BenchmarkDscalUnitaryToXN10000(b *testing.B) { benchmarkDscalUnitaryToX(b, 10000) }
func BenchmarkDscalUnitaryToXN100000(b *testing.B) { benchmarkDscalUnitaryToX(b, 100000) }
// benchmarkDscalUnitaryToX measures ScalUnitaryTo in the aliased case where
// source and destination are the same slice. Scaling by 2 then 0.5 keeps the
// values bounded across iterations.
func benchmarkDscalUnitaryToX(b *testing.B, n int) {
	v := randomSlice(n, 1)
	b.ResetTimer()
	for i := 0; i < b.N; i += 2 {
		ScalUnitaryTo(v, 2, v)
		ScalUnitaryTo(v, 0.5, v)
	}
	// Store the result so the compiler cannot eliminate the work.
	benchSink = v
}
func BenchmarkDscalIncN1Inc1(b *testing.B) { benchmarkDscalInc(b, 1, 1) }
func BenchmarkDscalIncN2Inc1(b *testing.B) { benchmarkDscalInc(b, 2, 1) }
func BenchmarkDscalIncN2Inc2(b *testing.B) { benchmarkDscalInc(b, 2, 2) }
func BenchmarkDscalIncN2Inc4(b *testing.B) { benchmarkDscalInc(b, 2, 4) }
func BenchmarkDscalIncN2Inc10(b *testing.B) { benchmarkDscalInc(b, 2, 10) }
func BenchmarkDscalIncN3Inc1(b *testing.B) { benchmarkDscalInc(b, 3, 1) }
func BenchmarkDscalIncN3Inc2(b *testing.B) { benchmarkDscalInc(b, 3, 2) }
func BenchmarkDscalIncN3Inc4(b *testing.B) { benchmarkDscalInc(b, 3, 4) }
func BenchmarkDscalIncN3Inc10(b *testing.B) { benchmarkDscalInc(b, 3, 10) }
func BenchmarkDscalIncN4Inc1(b *testing.B) { benchmarkDscalInc(b, 4, 1) }
func BenchmarkDscalIncN4Inc2(b *testing.B) { benchmarkDscalInc(b, 4, 2) }
func BenchmarkDscalIncN4Inc4(b *testing.B) { benchmarkDscalInc(b, 4, 4) }
func BenchmarkDscalIncN4Inc10(b *testing.B) { benchmarkDscalInc(b, 4, 10) }
func BenchmarkDscalIncN10Inc1(b *testing.B) { benchmarkDscalInc(b, 10, 1) }
func BenchmarkDscalIncN10Inc2(b *testing.B) { benchmarkDscalInc(b, 10, 2) }
func BenchmarkDscalIncN10Inc4(b *testing.B) { benchmarkDscalInc(b, 10, 4) }
func BenchmarkDscalIncN10Inc10(b *testing.B) { benchmarkDscalInc(b, 10, 10) }
func BenchmarkDscalIncN1000Inc1(b *testing.B) { benchmarkDscalInc(b, 1000, 1) }
func BenchmarkDscalIncN1000Inc2(b *testing.B) { benchmarkDscalInc(b, 1000, 2) }
func BenchmarkDscalIncN1000Inc4(b *testing.B) { benchmarkDscalInc(b, 1000, 4) }
func BenchmarkDscalIncN1000Inc10(b *testing.B) { benchmarkDscalInc(b, 1000, 10) }
func BenchmarkDscalIncN100000Inc1(b *testing.B) { benchmarkDscalInc(b, 100000, 1) }
func BenchmarkDscalIncN100000Inc2(b *testing.B) { benchmarkDscalInc(b, 100000, 2) }
func BenchmarkDscalIncN100000Inc4(b *testing.B) { benchmarkDscalInc(b, 100000, 4) }
func BenchmarkDscalIncN100000Inc10(b *testing.B) { benchmarkDscalInc(b, 100000, 10) }
// benchmarkDscalInc measures in-place strided scaling of n elements at the
// given stride. Alternating factors 2 and 0.5 keep the data bounded.
func benchmarkDscalInc(b *testing.B, n, inc int) {
	v := randomSlice(n, inc)
	ln, incX := uintptr(n), uintptr(inc)
	b.ResetTimer()
	for i := 0; i < b.N; i += 2 {
		ScalInc(2, v, ln, incX)
		ScalInc(0.5, v, ln, incX)
	}
	// Store the result so the compiler cannot eliminate the work.
	benchSink = v
}
func BenchmarkDscalIncToN1Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1, 1) }
func BenchmarkDscalIncToN2Inc1(b *testing.B) { benchmarkDscalIncTo(b, 2, 1) }
func BenchmarkDscalIncToN2Inc2(b *testing.B) { benchmarkDscalIncTo(b, 2, 2) }
func BenchmarkDscalIncToN2Inc4(b *testing.B) { benchmarkDscalIncTo(b, 2, 4) }
func BenchmarkDscalIncToN2Inc10(b *testing.B) { benchmarkDscalIncTo(b, 2, 10) }
func BenchmarkDscalIncToN3Inc1(b *testing.B) { benchmarkDscalIncTo(b, 3, 1) }
func BenchmarkDscalIncToN3Inc2(b *testing.B) { benchmarkDscalIncTo(b, 3, 2) }
func BenchmarkDscalIncToN3Inc4(b *testing.B) { benchmarkDscalIncTo(b, 3, 4) }
func BenchmarkDscalIncToN3Inc10(b *testing.B) { benchmarkDscalIncTo(b, 3, 10) }
func BenchmarkDscalIncToN4Inc1(b *testing.B) { benchmarkDscalIncTo(b, 4, 1) }
func BenchmarkDscalIncToN4Inc2(b *testing.B) { benchmarkDscalIncTo(b, 4, 2) }
func BenchmarkDscalIncToN4Inc4(b *testing.B) { benchmarkDscalIncTo(b, 4, 4) }
func BenchmarkDscalIncToN4Inc10(b *testing.B) { benchmarkDscalIncTo(b, 4, 10) }
func BenchmarkDscalIncToN10Inc1(b *testing.B) { benchmarkDscalIncTo(b, 10, 1) }
func BenchmarkDscalIncToN10Inc2(b *testing.B) { benchmarkDscalIncTo(b, 10, 2) }
func BenchmarkDscalIncToN10Inc4(b *testing.B) { benchmarkDscalIncTo(b, 10, 4) }
func BenchmarkDscalIncToN10Inc10(b *testing.B) { benchmarkDscalIncTo(b, 10, 10) }
func BenchmarkDscalIncToN1000Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1000, 1) }
func BenchmarkDscalIncToN1000Inc2(b *testing.B) { benchmarkDscalIncTo(b, 1000, 2) }
func BenchmarkDscalIncToN1000Inc4(b *testing.B) { benchmarkDscalIncTo(b, 1000, 4) }
func BenchmarkDscalIncToN1000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 1000, 10) }
func BenchmarkDscalIncToN100000Inc1(b *testing.B) { benchmarkDscalIncTo(b, 100000, 1) }
func BenchmarkDscalIncToN100000Inc2(b *testing.B) { benchmarkDscalIncTo(b, 100000, 2) }
func BenchmarkDscalIncToN100000Inc4(b *testing.B) { benchmarkDscalIncTo(b, 100000, 4) }
func BenchmarkDscalIncToN100000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 100000, 10) }
// benchmarkDscalIncTo measures out-of-place strided scaling of n elements,
// source and destination sharing the same stride.
func benchmarkDscalIncTo(b *testing.B, n, inc int) {
	src := randomSlice(n, inc)
	dst := randomSlice(n, inc)
	alpha := rand.Float64()
	ln, incX := uintptr(n), uintptr(inc)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		ScalIncTo(dst, incX, alpha, src, ln, incX)
	}
	// Store the result so the compiler cannot eliminate the work.
	benchSink = dst
}

View File

@@ -10,11 +10,16 @@ import (
"testing"
)
var dscalTests = []struct {
var scalTests = []struct {
alpha float64
x []float64
want []float64
}{
{
alpha: 0,
x: []float64{},
want: []float64{},
},
{
alpha: 0,
x: []float64{1},
@@ -60,281 +65,115 @@ var dscalTests = []struct {
x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9},
want: []float64{0, 2, -4, 6, 8, -10, 12, -14, 16, 18},
},
{
alpha: 3,
x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9, 12},
want: []float64{0, 3, -6, 9, 12, -15, 18, -21, 24, 27, 36},
},
}
func TestDscalUnitary(t *testing.T) {
for i, test := range dscalTests {
const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v"
func TestScalUnitary(t *testing.T) {
const xGdVal = -0.5
for i, test := range scalTests {
for _, align := range align1 {
prefix := fmt.Sprintf("Test %v (x:%v)", i, align)
xgLn := 4 + align
xg := guardVector(test.x, xGdVal, xgLn)
x := xg[xgLn : len(xg)-xgLn]
prefix := fmt.Sprintf("test %v (x*=a)", i)
x, xFront, xBack := newGuardedVector(test.x, 1)
ScalUnitary(test.alpha, x)
ScalUnitary(test.alpha, x)
if !allNaN(xFront) || !allNaN(xBack) {
t.Errorf(msgGuard, prefix, "x", xFront, xBack)
}
if !equalStrided(test.want, x, 1) {
t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x)
for i := range test.want {
if !same(x[i], test.want[i]) {
t.Errorf(msgVal, prefix, i, x[i], test.want[i])
}
}
if !isValidGuard(xg, xGdVal, xgLn) {
t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:])
}
}
}
}
func TestDscalUnitaryTo(t *testing.T) {
for i, test := range dscalTests {
const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v"
func TestScalUnitaryTo(t *testing.T) {
const xGdVal, dstGdVal = -1, 0.5
rng := rand.New(rand.NewSource(42))
for i, test := range scalTests {
n := len(test.x)
for _, align := range align2 {
prefix := fmt.Sprintf("Test %v (x:%v dst:%v)", i, align.x, align.y)
xgLn, dgLn := 4+align.x, 4+align.y
xg := guardVector(test.x, xGdVal, xgLn)
dg := guardVector(randSlice(n, 1, rng), dstGdVal, dgLn)
x, dst := xg[xgLn:len(xg)-xgLn], dg[dgLn:len(dg)-dgLn]
// Test dst = alpha * x.
prefix := fmt.Sprintf("test %v (dst=a*x)", i)
x, xFront, xBack := newGuardedVector(test.x, 1)
dst, dstFront, dstBack := newGuardedVector(test.x, 1)
ScalUnitaryTo(dst, test.alpha, x)
ScalUnitaryTo(dst, test.alpha, x)
if !allNaN(xFront) || !allNaN(xBack) {
t.Errorf(msgGuard, prefix, "x", xFront, xBack)
}
if !allNaN(dstFront) || !allNaN(dstBack) {
t.Errorf(msgGuard, prefix, "dst", dstFront, dstBack)
}
if !equalStrided(test.x, x, 1) {
t.Errorf("%v: modified read-only x argument", prefix)
}
if !equalStrided(test.want, dst, 1) {
t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, dst)
}
// Test x = alpha * x.
prefix = fmt.Sprintf("test %v (x=a*x)", i)
x, xFront, xBack = newGuardedVector(test.x, 1)
ScalUnitaryTo(x, test.alpha, x)
if !allNaN(xFront) || !allNaN(xBack) {
t.Errorf(msgGuard, prefix, "x", xFront, xBack)
}
if !equalStrided(test.want, x, 1) {
t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x)
for i := range test.want {
if !same(dst[i], test.want[i]) {
t.Errorf(msgVal, prefix, i, dst[i], test.want[i])
}
}
if !isValidGuard(xg, xGdVal, xgLn) {
t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:])
}
if !isValidGuard(dg, dstGdVal, dgLn) {
t.Errorf(msgGuard, prefix, "y", dg[:dgLn], dg[len(dg)-dgLn:])
}
if !equalStrided(test.x, x, 1) {
t.Errorf("%v: modified read-only x argument", prefix)
}
}
}
}
func TestDscalInc(t *testing.T) {
const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v"
for i, test := range dscalTests {
func TestScalInc(t *testing.T) {
const xGdVal = -0.5
gdLn := 4
for i, test := range scalTests {
n := len(test.x)
for _, incX := range []int{1, 2, 3, 4, 7, 10} {
prefix := fmt.Sprintf("test %v (x*=a), incX = %v", i, incX)
x, xFront, xBack := newGuardedVector(test.x, incX)
prefix := fmt.Sprintf("Test %v (x:%v)", i, incX)
xg := guardIncVector(test.x, xGdVal, incX, gdLn)
x := xg[gdLn : len(xg)-gdLn]
ScalInc(test.alpha, x, uintptr(n), uintptr(incX))
if !allNaN(xFront) || !allNaN(xBack) {
t.Errorf(msgGuard, prefix, "x", xFront, xBack)
}
if nonStridedWrite(x, incX) {
t.Errorf("%v: modified x argument at non-stride position", prefix)
}
if !equalStrided(test.want, x, incX) {
t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x)
for i := range test.want {
if !same(x[i*incX], test.want[i]) {
t.Errorf(msgVal, prefix, i, x[i*incX], test.want[i])
}
}
checkValidIncGuard(t, xg, xGdVal, incX, gdLn)
}
}
}
func TestDscalIncTo(t *testing.T) {
const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v"
for i, test := range dscalTests {
func TestScalIncTo(t *testing.T) {
const xGdVal, dstGdVal = -1, 0.5
gdLn := 4
rng := rand.New(rand.NewSource(42))
for i, test := range scalTests {
n := len(test.x)
for _, inc := range newIncSet(1, 2, 3, 4, 7, 10) {
prefix := fmt.Sprintf("test %v (x:%v dst:%v)", i, inc.x, inc.y)
xg := guardIncVector(test.x, xGdVal, inc.x, gdLn)
dg := guardIncVector(randSlice(n, 1, rng), dstGdVal, inc.y, gdLn)
x, dst := xg[gdLn:len(xg)-gdLn], dg[gdLn:len(dg)-gdLn]
for _, incX := range []int{1, 2, 3, 4, 7, 10} {
// Test x = alpha * x.
prefix := fmt.Sprintf("test %v (x=a*x), incX = %v", i, incX)
x, xFront, xBack := newGuardedVector(test.x, incX)
ScalIncTo(x, uintptr(incX), test.alpha, x, uintptr(n), uintptr(incX))
ScalIncTo(dst, uintptr(inc.y), test.alpha, x, uintptr(n), uintptr(inc.x))
if !allNaN(xFront) || !allNaN(xBack) {
t.Errorf(msgGuard, prefix, "x", xFront, xBack)
}
if nonStridedWrite(x, incX) {
t.Errorf("%v: modified x argument at non-stride position", prefix)
}
if !equalStrided(test.want, x, incX) {
t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x)
}
for _, incDst := range []int{1, 2, 3, 4, 7, 10} {
// Test dst = alpha * x.
prefix = fmt.Sprintf("test %v (dst=a*x), incX = %v, incDst = %v", i, incX, incDst)
x, xFront, xBack = newGuardedVector(test.x, incX)
dst, dstFront, dstBack := newGuardedVector(test.x, incDst)
ScalIncTo(dst, uintptr(incDst), test.alpha, x, uintptr(n), uintptr(incX))
if !allNaN(xFront) || !allNaN(xBack) {
t.Errorf(msgGuard, prefix, "x", xFront, xBack)
}
if !allNaN(dstFront) || !allNaN(dstBack) {
t.Errorf(msgGuard, prefix, "dst", dstFront, dstBack)
}
if nonStridedWrite(x, incX) || !equalStrided(test.x, x, incX) {
t.Errorf("%v: modified read-only x argument", prefix)
}
if nonStridedWrite(dst, incDst) {
t.Errorf("%v: modified dst argument at non-stride position", prefix)
}
if !equalStrided(test.want, dst, incDst) {
t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, dst)
for i := range test.want {
if !same(dst[i*inc.y], test.want[i]) {
t.Errorf(msgVal, prefix, i, dst[i*inc.y], test.want[i])
}
}
checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn)
checkValidIncGuard(t, dg, dstGdVal, inc.y, gdLn)
if !equalStrided(test.x, x, inc.x) {
t.Errorf("%v: modified read-only x argument", prefix)
}
}
}
}
func BenchmarkDscalUnitaryN1(b *testing.B) { benchmarkDscalUnitary(b, 1) }
func BenchmarkDscalUnitaryN2(b *testing.B) { benchmarkDscalUnitary(b, 2) }
func BenchmarkDscalUnitaryN3(b *testing.B) { benchmarkDscalUnitary(b, 3) }
func BenchmarkDscalUnitaryN4(b *testing.B) { benchmarkDscalUnitary(b, 4) }
func BenchmarkDscalUnitaryN10(b *testing.B) { benchmarkDscalUnitary(b, 10) }
func BenchmarkDscalUnitaryN100(b *testing.B) { benchmarkDscalUnitary(b, 100) }
func BenchmarkDscalUnitaryN1000(b *testing.B) { benchmarkDscalUnitary(b, 1000) }
func BenchmarkDscalUnitaryN10000(b *testing.B) { benchmarkDscalUnitary(b, 10000) }
func BenchmarkDscalUnitaryN100000(b *testing.B) { benchmarkDscalUnitary(b, 100000) }
func benchmarkDscalUnitary(b *testing.B, n int) {
x := randomSlice(n, 1)
b.ResetTimer()
for i := 0; i < b.N; i += 2 {
ScalUnitary(2, x)
ScalUnitary(0.5, x)
}
benchSink = x
}
func BenchmarkDscalUnitaryToN1(b *testing.B) { benchmarkDscalUnitaryTo(b, 1) }
func BenchmarkDscalUnitaryToN2(b *testing.B) { benchmarkDscalUnitaryTo(b, 2) }
func BenchmarkDscalUnitaryToN3(b *testing.B) { benchmarkDscalUnitaryTo(b, 3) }
func BenchmarkDscalUnitaryToN4(b *testing.B) { benchmarkDscalUnitaryTo(b, 4) }
func BenchmarkDscalUnitaryToN10(b *testing.B) { benchmarkDscalUnitaryTo(b, 10) }
func BenchmarkDscalUnitaryToN100(b *testing.B) { benchmarkDscalUnitaryTo(b, 100) }
func BenchmarkDscalUnitaryToN1000(b *testing.B) { benchmarkDscalUnitaryTo(b, 1000) }
func BenchmarkDscalUnitaryToN10000(b *testing.B) { benchmarkDscalUnitaryTo(b, 10000) }
func BenchmarkDscalUnitaryToN100000(b *testing.B) { benchmarkDscalUnitaryTo(b, 100000) }
func benchmarkDscalUnitaryTo(b *testing.B, n int) {
x := randomSlice(n, 1)
dst := randomSlice(n, 1)
a := rand.Float64()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ScalUnitaryTo(dst, a, x)
}
benchSink = dst
}
func BenchmarkDscalUnitaryToXN1(b *testing.B) { benchmarkDscalUnitaryToX(b, 1) }
func BenchmarkDscalUnitaryToXN2(b *testing.B) { benchmarkDscalUnitaryToX(b, 2) }
func BenchmarkDscalUnitaryToXN3(b *testing.B) { benchmarkDscalUnitaryToX(b, 3) }
func BenchmarkDscalUnitaryToXN4(b *testing.B) { benchmarkDscalUnitaryToX(b, 4) }
func BenchmarkDscalUnitaryToXN10(b *testing.B) { benchmarkDscalUnitaryToX(b, 10) }
func BenchmarkDscalUnitaryToXN100(b *testing.B) { benchmarkDscalUnitaryToX(b, 100) }
func BenchmarkDscalUnitaryToXN1000(b *testing.B) { benchmarkDscalUnitaryToX(b, 1000) }
func BenchmarkDscalUnitaryToXN10000(b *testing.B) { benchmarkDscalUnitaryToX(b, 10000) }
func BenchmarkDscalUnitaryToXN100000(b *testing.B) { benchmarkDscalUnitaryToX(b, 100000) }
func benchmarkDscalUnitaryToX(b *testing.B, n int) {
x := randomSlice(n, 1)
b.ResetTimer()
for i := 0; i < b.N; i += 2 {
ScalUnitaryTo(x, 2, x)
ScalUnitaryTo(x, 0.5, x)
}
benchSink = x
}
func BenchmarkDscalIncN1Inc1(b *testing.B) { benchmarkDscalInc(b, 1, 1) }
func BenchmarkDscalIncN2Inc1(b *testing.B) { benchmarkDscalInc(b, 2, 1) }
func BenchmarkDscalIncN2Inc2(b *testing.B) { benchmarkDscalInc(b, 2, 2) }
func BenchmarkDscalIncN2Inc4(b *testing.B) { benchmarkDscalInc(b, 2, 4) }
func BenchmarkDscalIncN2Inc10(b *testing.B) { benchmarkDscalInc(b, 2, 10) }
func BenchmarkDscalIncN3Inc1(b *testing.B) { benchmarkDscalInc(b, 3, 1) }
func BenchmarkDscalIncN3Inc2(b *testing.B) { benchmarkDscalInc(b, 3, 2) }
func BenchmarkDscalIncN3Inc4(b *testing.B) { benchmarkDscalInc(b, 3, 4) }
func BenchmarkDscalIncN3Inc10(b *testing.B) { benchmarkDscalInc(b, 3, 10) }
func BenchmarkDscalIncN4Inc1(b *testing.B) { benchmarkDscalInc(b, 4, 1) }
func BenchmarkDscalIncN4Inc2(b *testing.B) { benchmarkDscalInc(b, 4, 2) }
func BenchmarkDscalIncN4Inc4(b *testing.B) { benchmarkDscalInc(b, 4, 4) }
func BenchmarkDscalIncN4Inc10(b *testing.B) { benchmarkDscalInc(b, 4, 10) }
func BenchmarkDscalIncN10Inc1(b *testing.B) { benchmarkDscalInc(b, 10, 1) }
func BenchmarkDscalIncN10Inc2(b *testing.B) { benchmarkDscalInc(b, 10, 2) }
func BenchmarkDscalIncN10Inc4(b *testing.B) { benchmarkDscalInc(b, 10, 4) }
func BenchmarkDscalIncN10Inc10(b *testing.B) { benchmarkDscalInc(b, 10, 10) }
func BenchmarkDscalIncN1000Inc1(b *testing.B) { benchmarkDscalInc(b, 1000, 1) }
func BenchmarkDscalIncN1000Inc2(b *testing.B) { benchmarkDscalInc(b, 1000, 2) }
func BenchmarkDscalIncN1000Inc4(b *testing.B) { benchmarkDscalInc(b, 1000, 4) }
func BenchmarkDscalIncN1000Inc10(b *testing.B) { benchmarkDscalInc(b, 1000, 10) }
func BenchmarkDscalIncN100000Inc1(b *testing.B) { benchmarkDscalInc(b, 100000, 1) }
func BenchmarkDscalIncN100000Inc2(b *testing.B) { benchmarkDscalInc(b, 100000, 2) }
func BenchmarkDscalIncN100000Inc4(b *testing.B) { benchmarkDscalInc(b, 100000, 4) }
func BenchmarkDscalIncN100000Inc10(b *testing.B) { benchmarkDscalInc(b, 100000, 10) }
func benchmarkDscalInc(b *testing.B, n, inc int) {
x := randomSlice(n, inc)
b.ResetTimer()
for i := 0; i < b.N; i += 2 {
ScalInc(2, x, uintptr(n), uintptr(inc))
ScalInc(0.5, x, uintptr(n), uintptr(inc))
}
benchSink = x
}
func BenchmarkDscalIncToN1Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1, 1) }
func BenchmarkDscalIncToN2Inc1(b *testing.B) { benchmarkDscalIncTo(b, 2, 1) }
func BenchmarkDscalIncToN2Inc2(b *testing.B) { benchmarkDscalIncTo(b, 2, 2) }
func BenchmarkDscalIncToN2Inc4(b *testing.B) { benchmarkDscalIncTo(b, 2, 4) }
func BenchmarkDscalIncToN2Inc10(b *testing.B) { benchmarkDscalIncTo(b, 2, 10) }
func BenchmarkDscalIncToN3Inc1(b *testing.B) { benchmarkDscalIncTo(b, 3, 1) }
func BenchmarkDscalIncToN3Inc2(b *testing.B) { benchmarkDscalIncTo(b, 3, 2) }
func BenchmarkDscalIncToN3Inc4(b *testing.B) { benchmarkDscalIncTo(b, 3, 4) }
func BenchmarkDscalIncToN3Inc10(b *testing.B) { benchmarkDscalIncTo(b, 3, 10) }
func BenchmarkDscalIncToN4Inc1(b *testing.B) { benchmarkDscalIncTo(b, 4, 1) }
func BenchmarkDscalIncToN4Inc2(b *testing.B) { benchmarkDscalIncTo(b, 4, 2) }
func BenchmarkDscalIncToN4Inc4(b *testing.B) { benchmarkDscalIncTo(b, 4, 4) }
func BenchmarkDscalIncToN4Inc10(b *testing.B) { benchmarkDscalIncTo(b, 4, 10) }
func BenchmarkDscalIncToN10Inc1(b *testing.B) { benchmarkDscalIncTo(b, 10, 1) }
func BenchmarkDscalIncToN10Inc2(b *testing.B) { benchmarkDscalIncTo(b, 10, 2) }
func BenchmarkDscalIncToN10Inc4(b *testing.B) { benchmarkDscalIncTo(b, 10, 4) }
func BenchmarkDscalIncToN10Inc10(b *testing.B) { benchmarkDscalIncTo(b, 10, 10) }
func BenchmarkDscalIncToN1000Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1000, 1) }
func BenchmarkDscalIncToN1000Inc2(b *testing.B) { benchmarkDscalIncTo(b, 1000, 2) }
func BenchmarkDscalIncToN1000Inc4(b *testing.B) { benchmarkDscalIncTo(b, 1000, 4) }
func BenchmarkDscalIncToN1000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 1000, 10) }
func BenchmarkDscalIncToN100000Inc1(b *testing.B) { benchmarkDscalIncTo(b, 100000, 1) }
func BenchmarkDscalIncToN100000Inc2(b *testing.B) { benchmarkDscalIncTo(b, 100000, 2) }
func BenchmarkDscalIncToN100000Inc4(b *testing.B) { benchmarkDscalIncTo(b, 100000, 4) }
func BenchmarkDscalIncToN100000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 100000, 10) }
func benchmarkDscalIncTo(b *testing.B, n, inc int) {
x := randomSlice(n, inc)
dst := randomSlice(n, inc)
a := rand.Float64()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ScalIncTo(dst, uintptr(inc), a, x, uintptr(n), uintptr(inc))
}
benchSink = dst
}

View File

@@ -38,43 +38,76 @@
#include "textflag.h"
// func DscalInc(alpha float64, x []float64, n, incX uintptr)
#define X_PTR SI
#define LEN CX
#define TAIL BX
#define INC_X R8
#define INCx3_X R9
#define ALPHA X0
#define ALPHA_2 X1
// func ScalInc(alpha float64, x []float64, n, incX uintptr)
TEXT ·ScalInc(SB), NOSPLIT, $0
MOVHPD alpha+0(FP), X7
MOVLPD alpha+0(FP), X7
MOVQ x+8(FP), R8
MOVQ n+32(FP), DX
MOVQ incX+40(FP), R10
MOVSD alpha+0(FP), ALPHA // ALPHA = alpha
MOVQ x_base+8(FP), X_PTR // X_PTR = &x
MOVQ incX+40(FP), INC_X // INC_X = incX
SHLQ $3, INC_X // INC_X *= sizeof(float64)
MOVQ n+32(FP), LEN // LEN = n
CMPQ LEN, $0
JE end // if LEN == 0 { return }
MOVQ $0, SI
MOVQ R10, AX // nextX = incX
SHLQ $1, R10 // incX *= 2
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ tail_start // if LEN == 0 { goto tail_start }
SUBQ $2, DX // n -= 2
JL tail // if n < 0
MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
loop:
// x[i] *= alpha unrolled 2x.
MOVHPD 0(R8)(SI*8), X0
MOVLPD 0(R8)(AX*8), X0
MULPD X7, X0
MOVHPD X0, 0(R8)(SI*8)
MOVLPD X0, 0(R8)(AX*8)
loop: // do { // x[i] *= alpha unrolled 4x.
MOVSD (X_PTR), X2 // X_i = x[i]
MOVSD (X_PTR)(INC_X*1), X3
MOVSD (X_PTR)(INC_X*2), X4
MOVSD (X_PTR)(INCx3_X*1), X5
ADDQ R10, SI // ix += incX
ADDQ R10, AX // nextX += incX
MULSD ALPHA, X2 // X_i *= a
MULSD ALPHA_2, X3
MULSD ALPHA, X4
MULSD ALPHA_2, X5
SUBQ $2, DX // n -= 2
JGE loop // if n >= 0 goto loop
MOVSD X2, (X_PTR) // x[i] = X_i
MOVSD X3, (X_PTR)(INC_X*1)
MOVSD X4, (X_PTR)(INC_X*2)
MOVSD X5, (X_PTR)(INCx3_X*1)
tail:
ADDQ $2, DX // n += 2
JLE end // if n <= 0
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
DECQ LEN
JNZ loop // } while --LEN > 0
CMPQ TAIL, $0
JE end // if TAIL == 0 { return }
// x[i] *= alpha for the last iteration if n is odd.
MOVSD 0(R8)(SI*8), X0
MULSD X7, X0
MOVSD X0, 0(R8)(SI*8)
tail_start: // Reset loop registers
MOVQ TAIL, LEN // Loop counter: LEN = TAIL
SHRQ $1, LEN // LEN = floor( LEN / 2 )
JZ tail_one
tail_two: // do {
MOVSD (X_PTR), X2 // X_i = x[i]
MOVSD (X_PTR)(INC_X*1), X3
MULSD ALPHA, X2 // X_i *= a
MULSD ALPHA, X3
MOVSD X2, (X_PTR) // x[i] = X_i
MOVSD X3, (X_PTR)(INC_X*1)
LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2])
ANDQ $1, TAIL
JZ end
tail_one:
MOVSD (X_PTR), X2 // X_i = x[i]
MULSD ALPHA, X2 // X_i *= ALPHA
MOVSD X2, (X_PTR) // x[i] = X_i
end:
RET

View File

@@ -38,50 +38,85 @@
#include "textflag.h"
// func DscalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr)
#define X_PTR SI
#define DST_PTR DI
#define LEN CX
#define TAIL BX
#define INC_X R8
#define INCx3_X R9
#define INC_DST R10
#define INCx3_DST R11
#define ALPHA X0
#define ALPHA_2 X1
// func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr)
TEXT ·ScalIncTo(SB), NOSPLIT, $0
MOVQ dst+0(FP), R9
MOVQ incDst+24(FP), R11
MOVHPD alpha+32(FP), X7
MOVLPD alpha+32(FP), X7
MOVQ x+40(FP), R8
MOVQ n+64(FP), DX
MOVQ incX+72(FP), R10
MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst
MOVQ incDst+24(FP), INC_DST // INC_DST = incDst
SHLQ $3, INC_DST // INC_DST *= sizeof(float64)
MOVSD alpha+32(FP), ALPHA // ALPHA = alpha
MOVQ x_base+40(FP), X_PTR // X_PTR = &x
MOVQ n+64(FP), LEN // LEN = n
MOVQ incX+72(FP), INC_X // INC_X = incX
SHLQ $3, INC_X // INC_X *= sizeof(float64)
CMPQ LEN, $0
JE end // if LEN == 0 { return }
MOVQ $0, SI
MOVQ $0, DI
MOVQ R10, AX // nextX = incX
MOVQ R11, BX // nextDst = incDst
SHLQ $1, R10 // incX *= 2
SHLQ $1, R11 // incDst *= 2
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ tail_start // if LEN == 0 { goto tail_start }
SUBQ $2, DX // n -= 2
JL tail // if n < 0
MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3
loop:
// dst[i] = alpha * x[i] unrolled 2x.
MOVHPD 0(R8)(SI*8), X0
MOVLPD 0(R8)(AX*8), X0
MULPD X7, X0
MOVHPD X0, 0(R9)(DI*8)
MOVLPD X0, 0(R9)(BX*8)
loop: // do { // x[i] *= alpha unrolled 4x.
MOVSD (X_PTR), X2 // X_i = x[i]
MOVSD (X_PTR)(INC_X*1), X3
MOVSD (X_PTR)(INC_X*2), X4
MOVSD (X_PTR)(INCx3_X*1), X5
ADDQ R10, SI // ix += incX
ADDQ R10, AX // nextX += incX
ADDQ R11, DI // idst += incDst
ADDQ R11, BX // nextDst += incDst
MULSD ALPHA, X2 // X_i *= a
MULSD ALPHA_2, X3
MULSD ALPHA, X4
MULSD ALPHA_2, X5
SUBQ $2, DX // n -= 2
JGE loop // if n >= 0 goto loop
MOVSD X2, (DST_PTR) // dst[i] = X_i
MOVSD X3, (DST_PTR)(INC_DST*1)
MOVSD X4, (DST_PTR)(INC_DST*2)
MOVSD X5, (DST_PTR)(INCx3_DST*1)
tail:
ADDQ $2, DX // n += 2
JLE end // if n <= 0
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4])
DECQ LEN
JNZ loop // } while --LEN > 0
CMPQ TAIL, $0
JE end // if TAIL == 0 { return }
// dst[i] = alpha * x[i] for the last iteration if n is odd.
MOVSD 0(R8)(SI*8), X0
MULSD X7, X0
MOVSD X0, 0(R9)(DI*8)
tail_start: // Reset loop registers
MOVQ TAIL, LEN // Loop counter: LEN = TAIL
SHRQ $1, LEN // LEN = floor( LEN / 2 )
JZ tail_one
tail_two:
MOVSD (X_PTR), X2 // X_i = x[i]
MOVSD (X_PTR)(INC_X*1), X3
MULSD ALPHA, X2 // X_i *= a
MULSD ALPHA, X3
MOVSD X2, (DST_PTR) // dst[i] = X_i
MOVSD X3, (DST_PTR)(INC_DST*1)
LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2])
LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2])
ANDQ $1, TAIL
JZ end
tail_one:
MOVSD (X_PTR), X2 // X_i = x[i]
MULSD ALPHA, X2 // X_i *= ALPHA
MOVSD X2, (DST_PTR) // x[i] = X_i
end:
RET

View File

@@ -38,43 +38,75 @@
#include "textflag.h"
// func DscalUnitary(alpha float64, x []float64)
#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // @ MOVDDUP XMM0, 8[RSP]
#define X_PTR SI
#define DST_PTR DI
#define IDX AX
#define LEN CX
#define TAIL BX
#define ALPHA X0
#define ALPHA_2 X1
// func ScalUnitary(alpha float64, x []float64)
TEXT ·ScalUnitary(SB), NOSPLIT, $0
MOVHPD alpha+0(FP), X7
MOVLPD alpha+0(FP), X7
MOVQ x+8(FP), R8
MOVQ x_len+16(FP), DI // n = len(x)
MOVDDUP_ALPHA // ALPHA = { alpha, alpha }
MOVQ x_base+8(FP), X_PTR // X_PTR = &x
MOVQ x_len+16(FP), LEN // LEN = len(x)
CMPQ LEN, $0
JE end // if LEN == 0 { return }
XORQ IDX, IDX // IDX = 0
MOVQ $0, SI // i = 0
SUBQ $4, DI // n -= 4
JL tail // if n < 0 goto tail
MOVQ LEN, TAIL
ANDQ $7, TAIL // TAIL = LEN % 8
SHRQ $3, LEN // LEN = floor( LEN / 8 )
JZ tail_start // if LEN == 0 { goto tail_start }
loop:
// x[i] *= alpha unrolled 4x.
MOVUPD 0(R8)(SI*8), X0
MOVUPD 16(R8)(SI*8), X1
MULPD X7, X0
MULPD X7, X1
MOVUPD X0, 0(R8)(SI*8)
MOVUPD X1, 16(R8)(SI*8)
MOVUPS ALPHA, ALPHA_2
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
JGE loop // if n >= 0 goto loop
loop: // do { // x[i] *= alpha unrolled 8x.
MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i]
MOVUPS 16(X_PTR)(IDX*8), X3
MOVUPS 32(X_PTR)(IDX*8), X4
MOVUPS 48(X_PTR)(IDX*8), X5
tail:
ADDQ $4, DI // n += 4
JZ end // if n == 0 goto end
MULPD ALPHA, X2 // X_i *= ALPHA
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5
onemore:
// x[i] *= alpha for the remaining 1-3 elements.
MOVSD 0(R8)(SI*8), X0
MULSD X7, X0
MOVSD X0, 0(R8)(SI*8)
MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i
MOVUPS X3, 16(X_PTR)(IDX*8)
MOVUPS X4, 32(X_PTR)(IDX*8)
MOVUPS X5, 48(X_PTR)(IDX*8)
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JNZ onemore // if n != 0 goto onemore
ADDQ $8, IDX // i += 8
DECQ LEN
JNZ loop // while --LEN > 0
CMPQ TAIL, $0
JE end // if TAIL == 0 { return }
tail_start: // Reset loop registers
MOVQ TAIL, LEN // Loop counter: LEN = TAIL
SHRQ $1, LEN // LEN = floor( TAIL / 2 )
JZ tail_one // if n == 0 goto end
tail_two: // do {
MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i]
MULPD ALPHA, X2 // X_i *= ALPHA
MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i
ADDQ $2, IDX // i += 2
DECQ LEN
JNZ tail_two // while --LEN > 0
ANDQ $1, TAIL
JZ end // if TAIL == 0 { return }
tail_one:
// x[i] *= alpha for the remaining element.
MOVSD (X_PTR)(IDX*8), X2
MULSD ALPHA, X2
MOVSD X2, (X_PTR)(IDX*8)
end:
RET

View File

@@ -38,45 +38,76 @@
#include "textflag.h"
// func DscalUnitaryTo(dst []float64, alpha float64, x []float64)
#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x2024 // @ MOVDDUP 32(SP), X0 /*XMM0, 32[RSP]*/
#define X_PTR SI
#define DST_PTR DI
#define IDX AX
#define LEN CX
#define TAIL BX
#define ALPHA X0
#define ALPHA_2 X1
// func ScalUnitaryTo(dst []float64, alpha float64, x []float64)
// This function assumes len(dst) >= len(x).
TEXT ·ScalUnitaryTo(SB), NOSPLIT, $0
MOVQ dst+0(FP), R9
MOVHPD alpha+24(FP), X7
MOVLPD alpha+24(FP), X7
MOVQ x+32(FP), R8
MOVQ x_len+40(FP), DI // n = len(x)
MOVQ x_base+32(FP), X_PTR // X_PTR = &x
MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst
MOVDDUP_ALPHA // ALPHA = { alpha, alpha }
MOVQ x_len+40(FP), LEN // LEN = len(x)
CMPQ LEN, $0
JE end // if LEN == 0 { return }
MOVQ $0, SI // i = 0
SUBQ $4, DI // n -= 4
JL tail // if n < 0 goto tail
XORQ IDX, IDX // IDX = 0
MOVQ LEN, TAIL
ANDQ $7, TAIL // TAIL = LEN % 8
SHRQ $3, LEN // LEN = floor( LEN / 8 )
JZ tail_start // if LEN == 0 { goto tail_start }
loop:
// dst[i] = alpha * x[i] unrolled 4x.
MOVUPD 0(R8)(SI*8), X0
MOVUPD 16(R8)(SI*8), X1
MULPD X7, X0
MULPD X7, X1
MOVUPD X0, 0(R9)(SI*8)
MOVUPD X1, 16(R9)(SI*8)
MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
JGE loop // if n >= 0 goto loop
loop: // do { // dst[i] = alpha * x[i] unrolled 8x.
MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i]
MOVUPS 16(X_PTR)(IDX*8), X3
MOVUPS 32(X_PTR)(IDX*8), X4
MOVUPS 48(X_PTR)(IDX*8), X5
tail:
ADDQ $4, DI // n += 4
JZ end // if n == 0 goto end
MULPD ALPHA, X2 // X_i *= ALPHA
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5
onemore:
// dst[i] = alpha * x[i] for the remaining 1-3 elements.
MOVSD 0(R8)(SI*8), X0
MULSD X7, X0
MOVSD X0, 0(R9)(SI*8)
MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i
MOVUPS X3, 16(DST_PTR)(IDX*8)
MOVUPS X4, 32(DST_PTR)(IDX*8)
MOVUPS X5, 48(DST_PTR)(IDX*8)
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JNZ onemore // if n != 0 goto onemore
ADDQ $8, IDX // i += 8
DECQ LEN
JNZ loop // while --LEN > 0
CMPQ TAIL, $0
JE end // if TAIL == 0 { return }
tail_start: // Reset loop counters
MOVQ TAIL, LEN // Loop counter: LEN = TAIL
SHRQ $1, LEN // LEN = floor( TAIL / 2 )
JZ tail_one // if LEN == 0 { goto tail_one }
tail_two: // do {
MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i]
MULPD ALPHA, X2 // X_i *= ALPHA
MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i
ADDQ $2, IDX // i += 2
DECQ LEN
JNZ tail_two // while --LEN > 0
ANDQ $1, TAIL
JZ end // if TAIL == 0 { return }
tail_one:
MOVSD (X_PTR)(IDX*8), X2 // X_i = x[i]
MULSD ALPHA, X2 // X_i *= ALPHA
MOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X_i
end:
RET