Merge pull request #53 from Kunde21/f64Scal
asm/f64: Updated scal assembly to wide, pipelined loops.
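For reference, all four kernels touched by this change implement variants of the BLAS scal operation, x = alpha * x. The following pure-Go sketch of their semantics uses the signatures visible in the diff below; it is illustrative only, not the package's actual fallback code:

// Reference semantics only; the real implementations are the assembly
// bodies shown in this diff.
func ScalUnitary(alpha float64, x []float64) {
	for i := range x {
		x[i] *= alpha
	}
}

// ScalUnitaryTo writes alpha*x into dst; it assumes len(dst) >= len(x).
func ScalUnitaryTo(dst []float64, alpha float64, x []float64) {
	for i, v := range x {
		dst[i] = alpha * v
	}
}

// ScalInc scales n elements of x spaced incX apart.
func ScalInc(alpha float64, x []float64, n, incX uintptr) {
	var ix uintptr
	for i := 0; i < int(n); i++ {
		x[ix] *= alpha
		ix += incX
	}
}

// ScalIncTo writes alpha*x into dst, with independent strides for each slice.
func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) {
	var idst, ix uintptr
	for i := 0; i < int(n); i++ {
		dst[idst] = alpha * x[ix]
		idst += incDst
		ix += incX
	}
}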
@@ -140,6 +140,7 @@ func same(a, b float64) bool {
 }
 
 var ( // Offset sets for testing alignment handling in Unitary assembly functions.
+	align1 = []int{0, 1}
 	align2 = newIncSet(0, 1)
 	align3 = newIncToSet(0, 1)
 )
@@ -190,3 +191,14 @@ func randomSlice(n, inc int) []float64 {
 	}
 	return x
 }
+
+func randSlice(n, inc int, r *rand.Rand) []float64 {
+	if inc < 0 {
+		inc = -inc
+	}
+	x := make([]float64, (n-1)*inc+1)
+	for i := range x {
+		x[i] = r.Float64()
+	}
+	return x
+}
asm/f64/benchScal_test.go (new file, 87 lines)
@@ -0,0 +1,87 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.7
+
+package f64
+
+import (
+	"fmt"
+	"testing"
+)
+
+var uniScal = []int64{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4}
+
+func BenchmarkScalUnitary(t *testing.B) {
+	tstName := "ScalUnitary"
+	for _, ln := range uniScal {
+		t.Run(fmt.Sprintf("%s-%d", tstName, ln), func(b *testing.B) {
+			b.SetBytes(64 * ln)
+			x := x[:ln]
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				ScalUnitary(a, x)
+			}
+		})
+	}
+}
+
+func BenchmarkScalUnitaryTo(t *testing.B) {
+	tstName := "ScalUnitaryTo"
+	for _, ln := range uniScal {
+		t.Run(fmt.Sprintf("%s-%d", tstName, ln), func(b *testing.B) {
+			b.SetBytes(int64(64 * ln))
+			x, y := x[:ln], y[:ln]
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				ScalUnitaryTo(y, a, x)
+			}
+		})
+	}
+}
+
+var incScal = []struct {
+	len uintptr
+	inc []int
+}{
+	{1, []int{1}},
+	{3, []int{1, 2, 4, 10}},
+	{10, []int{1, 2, 4, 10}},
+	{30, []int{1, 2, 4, 10}},
+	{1e2, []int{1, 2, 4, 10}},
+	{3e2, []int{1, 2, 4, 10}},
+	{1e3, []int{1, 2, 4, 10}},
+	{3e3, []int{1, 2, 4, 10}},
+	{1e4, []int{1, 2, 4, 10}},
+}
+
+func BenchmarkScalInc(t *testing.B) {
+	tstName := "ScalInc"
+	for _, tt := range incScal {
+		for _, inc := range tt.inc {
+			t.Run(fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc), func(b *testing.B) {
+				b.SetBytes(int64(64 * tt.len))
+				tstInc := uintptr(inc)
+				for i := 0; i < b.N; i++ {
+					ScalInc(a, x, uintptr(tt.len), tstInc)
+				}
+			})
+		}
+	}
+}
+
+func BenchmarkScalIncTo(t *testing.B) {
+	tstName := "ScalIncTo"
+	for _, tt := range incScal {
+		for _, inc := range tt.inc {
+			t.Run(fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc), func(b *testing.B) {
+				b.SetBytes(int64(64 * tt.len))
+				tstInc := uintptr(inc)
+				for i := 0; i < b.N; i++ {
+					ScalIncTo(z, tstInc, a, x, uintptr(tt.len), tstInc)
+				}
+			})
+		}
+	}
+}
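The // +build go1.7 constraint in the new file is needed because these benchmarks use the sub-benchmark API added in Go 1.7: every size in uniScal and incScal becomes a named child benchmark via b.Run, and b.SetBytes makes go test -bench report throughput. A stripped-down sketch of the same pattern (the sizes and byte count here are illustrative placeholders, not the values used above):

package f64

import (
	"fmt"
	"testing"
)

func BenchmarkPattern(t *testing.B) {
	for _, n := range []int{10, 1000} { // placeholder sizes
		t.Run(fmt.Sprintf("Scal-%d", n), func(b *testing.B) {
			x := make([]float64, n)
			b.SetBytes(int64(8 * n)) // 8 bytes per float64 touched per iteration
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				ScalUnitary(2, x)
			}
		})
	}
}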
@@ -6,7 +6,10 @@
 
 package f64
 
-import "testing"
+import (
+	"math/rand"
+	"testing"
+)
 
 var (
 	a = float64(2)
@@ -286,3 +289,150 @@ func BenchmarkLF64AxpyIncToN100000IncM1(b *testing.B) { benchaxpyincto(b, 100000, -1, naiveaxpyincto) }
 func BenchmarkLF64AxpyIncToN100000IncM2(b *testing.B) { benchaxpyincto(b, 100000, -2, naiveaxpyincto) }
 func BenchmarkLF64AxpyIncToN100000IncM4(b *testing.B) { benchaxpyincto(b, 100000, -4, naiveaxpyincto) }
 func BenchmarkLF64AxpyIncToN100000IncM10(b *testing.B) { benchaxpyincto(b, 100000, -10, naiveaxpyincto) }
+
+// Scal* benchmarks
+func BenchmarkDscalUnitaryN1(b *testing.B)      { benchmarkDscalUnitary(b, 1) }
+func BenchmarkDscalUnitaryN2(b *testing.B)      { benchmarkDscalUnitary(b, 2) }
+func BenchmarkDscalUnitaryN3(b *testing.B)      { benchmarkDscalUnitary(b, 3) }
+func BenchmarkDscalUnitaryN4(b *testing.B)      { benchmarkDscalUnitary(b, 4) }
+func BenchmarkDscalUnitaryN10(b *testing.B)     { benchmarkDscalUnitary(b, 10) }
+func BenchmarkDscalUnitaryN100(b *testing.B)    { benchmarkDscalUnitary(b, 100) }
+func BenchmarkDscalUnitaryN1000(b *testing.B)   { benchmarkDscalUnitary(b, 1000) }
+func BenchmarkDscalUnitaryN10000(b *testing.B)  { benchmarkDscalUnitary(b, 10000) }
+func BenchmarkDscalUnitaryN100000(b *testing.B) { benchmarkDscalUnitary(b, 100000) }
+
+func benchmarkDscalUnitary(b *testing.B, n int) {
+	x := randomSlice(n, 1)
+	b.ResetTimer()
+	for i := 0; i < b.N; i += 2 {
+		ScalUnitary(2, x)
+		ScalUnitary(0.5, x)
+	}
+	benchSink = x
+}
+
+func BenchmarkDscalUnitaryToN1(b *testing.B)      { benchmarkDscalUnitaryTo(b, 1) }
+func BenchmarkDscalUnitaryToN2(b *testing.B)      { benchmarkDscalUnitaryTo(b, 2) }
+func BenchmarkDscalUnitaryToN3(b *testing.B)      { benchmarkDscalUnitaryTo(b, 3) }
+func BenchmarkDscalUnitaryToN4(b *testing.B)      { benchmarkDscalUnitaryTo(b, 4) }
+func BenchmarkDscalUnitaryToN10(b *testing.B)     { benchmarkDscalUnitaryTo(b, 10) }
+func BenchmarkDscalUnitaryToN100(b *testing.B)    { benchmarkDscalUnitaryTo(b, 100) }
+func BenchmarkDscalUnitaryToN1000(b *testing.B)   { benchmarkDscalUnitaryTo(b, 1000) }
+func BenchmarkDscalUnitaryToN10000(b *testing.B)  { benchmarkDscalUnitaryTo(b, 10000) }
+func BenchmarkDscalUnitaryToN100000(b *testing.B) { benchmarkDscalUnitaryTo(b, 100000) }
+
+func benchmarkDscalUnitaryTo(b *testing.B, n int) {
+	x := randomSlice(n, 1)
+	dst := randomSlice(n, 1)
+	a := rand.Float64()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ScalUnitaryTo(dst, a, x)
+	}
+	benchSink = dst
+}
+
+func BenchmarkDscalUnitaryToXN1(b *testing.B)      { benchmarkDscalUnitaryToX(b, 1) }
+func BenchmarkDscalUnitaryToXN2(b *testing.B)      { benchmarkDscalUnitaryToX(b, 2) }
+func BenchmarkDscalUnitaryToXN3(b *testing.B)      { benchmarkDscalUnitaryToX(b, 3) }
+func BenchmarkDscalUnitaryToXN4(b *testing.B)      { benchmarkDscalUnitaryToX(b, 4) }
+func BenchmarkDscalUnitaryToXN10(b *testing.B)     { benchmarkDscalUnitaryToX(b, 10) }
+func BenchmarkDscalUnitaryToXN100(b *testing.B)    { benchmarkDscalUnitaryToX(b, 100) }
+func BenchmarkDscalUnitaryToXN1000(b *testing.B)   { benchmarkDscalUnitaryToX(b, 1000) }
+func BenchmarkDscalUnitaryToXN10000(b *testing.B)  { benchmarkDscalUnitaryToX(b, 10000) }
+func BenchmarkDscalUnitaryToXN100000(b *testing.B) { benchmarkDscalUnitaryToX(b, 100000) }
+
+func benchmarkDscalUnitaryToX(b *testing.B, n int) {
+	x := randomSlice(n, 1)
+	b.ResetTimer()
+	for i := 0; i < b.N; i += 2 {
+		ScalUnitaryTo(x, 2, x)
+		ScalUnitaryTo(x, 0.5, x)
+	}
+	benchSink = x
+}
+
+func BenchmarkDscalIncN1Inc1(b *testing.B) { benchmarkDscalInc(b, 1, 1) }
+
+func BenchmarkDscalIncN2Inc1(b *testing.B)  { benchmarkDscalInc(b, 2, 1) }
+func BenchmarkDscalIncN2Inc2(b *testing.B)  { benchmarkDscalInc(b, 2, 2) }
+func BenchmarkDscalIncN2Inc4(b *testing.B)  { benchmarkDscalInc(b, 2, 4) }
+func BenchmarkDscalIncN2Inc10(b *testing.B) { benchmarkDscalInc(b, 2, 10) }
+
+func BenchmarkDscalIncN3Inc1(b *testing.B)  { benchmarkDscalInc(b, 3, 1) }
+func BenchmarkDscalIncN3Inc2(b *testing.B)  { benchmarkDscalInc(b, 3, 2) }
+func BenchmarkDscalIncN3Inc4(b *testing.B)  { benchmarkDscalInc(b, 3, 4) }
+func BenchmarkDscalIncN3Inc10(b *testing.B) { benchmarkDscalInc(b, 3, 10) }
+
+func BenchmarkDscalIncN4Inc1(b *testing.B)  { benchmarkDscalInc(b, 4, 1) }
+func BenchmarkDscalIncN4Inc2(b *testing.B)  { benchmarkDscalInc(b, 4, 2) }
+func BenchmarkDscalIncN4Inc4(b *testing.B)  { benchmarkDscalInc(b, 4, 4) }
+func BenchmarkDscalIncN4Inc10(b *testing.B) { benchmarkDscalInc(b, 4, 10) }
+
+func BenchmarkDscalIncN10Inc1(b *testing.B)  { benchmarkDscalInc(b, 10, 1) }
+func BenchmarkDscalIncN10Inc2(b *testing.B)  { benchmarkDscalInc(b, 10, 2) }
+func BenchmarkDscalIncN10Inc4(b *testing.B)  { benchmarkDscalInc(b, 10, 4) }
+func BenchmarkDscalIncN10Inc10(b *testing.B) { benchmarkDscalInc(b, 10, 10) }
+
+func BenchmarkDscalIncN1000Inc1(b *testing.B)  { benchmarkDscalInc(b, 1000, 1) }
+func BenchmarkDscalIncN1000Inc2(b *testing.B)  { benchmarkDscalInc(b, 1000, 2) }
+func BenchmarkDscalIncN1000Inc4(b *testing.B)  { benchmarkDscalInc(b, 1000, 4) }
+func BenchmarkDscalIncN1000Inc10(b *testing.B) { benchmarkDscalInc(b, 1000, 10) }
+
+func BenchmarkDscalIncN100000Inc1(b *testing.B)  { benchmarkDscalInc(b, 100000, 1) }
+func BenchmarkDscalIncN100000Inc2(b *testing.B)  { benchmarkDscalInc(b, 100000, 2) }
+func BenchmarkDscalIncN100000Inc4(b *testing.B)  { benchmarkDscalInc(b, 100000, 4) }
+func BenchmarkDscalIncN100000Inc10(b *testing.B) { benchmarkDscalInc(b, 100000, 10) }
+
+func benchmarkDscalInc(b *testing.B, n, inc int) {
+	x := randomSlice(n, inc)
+	b.ResetTimer()
+	for i := 0; i < b.N; i += 2 {
+		ScalInc(2, x, uintptr(n), uintptr(inc))
+		ScalInc(0.5, x, uintptr(n), uintptr(inc))
+	}
+	benchSink = x
+}
+
+func BenchmarkDscalIncToN1Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1, 1) }
+
+func BenchmarkDscalIncToN2Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 2, 1) }
+func BenchmarkDscalIncToN2Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 2, 2) }
+func BenchmarkDscalIncToN2Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 2, 4) }
+func BenchmarkDscalIncToN2Inc10(b *testing.B) { benchmarkDscalIncTo(b, 2, 10) }
+
+func BenchmarkDscalIncToN3Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 3, 1) }
+func BenchmarkDscalIncToN3Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 3, 2) }
+func BenchmarkDscalIncToN3Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 3, 4) }
+func BenchmarkDscalIncToN3Inc10(b *testing.B) { benchmarkDscalIncTo(b, 3, 10) }
+
+func BenchmarkDscalIncToN4Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 4, 1) }
+func BenchmarkDscalIncToN4Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 4, 2) }
+func BenchmarkDscalIncToN4Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 4, 4) }
+func BenchmarkDscalIncToN4Inc10(b *testing.B) { benchmarkDscalIncTo(b, 4, 10) }
+
+func BenchmarkDscalIncToN10Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 10, 1) }
+func BenchmarkDscalIncToN10Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 10, 2) }
+func BenchmarkDscalIncToN10Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 10, 4) }
+func BenchmarkDscalIncToN10Inc10(b *testing.B) { benchmarkDscalIncTo(b, 10, 10) }
+
+func BenchmarkDscalIncToN1000Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 1000, 1) }
+func BenchmarkDscalIncToN1000Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 1000, 2) }
+func BenchmarkDscalIncToN1000Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 1000, 4) }
+func BenchmarkDscalIncToN1000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 1000, 10) }
+
+func BenchmarkDscalIncToN100000Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 100000, 1) }
+func BenchmarkDscalIncToN100000Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 100000, 2) }
+func BenchmarkDscalIncToN100000Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 100000, 4) }
+func BenchmarkDscalIncToN100000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 100000, 10) }
+
+func benchmarkDscalIncTo(b *testing.B, n, inc int) {
+	x := randomSlice(n, inc)
+	dst := randomSlice(n, inc)
+	a := rand.Float64()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ScalIncTo(dst, uintptr(inc), a, x, uintptr(n), uintptr(inc))
+	}
+	benchSink = dst
+}
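Two details of the benchmark bodies above are worth noting. Alternating a scale by 2 with a scale by 0.5 keeps the vector's values bounded no matter how large b.N grows, and storing the result into the package-level benchSink variable stops the compiler from treating the loop as dead code. A minimal sketch of the sink idiom (mirroring, not reproducing, the file's own declarations):

var benchSink []float64 // package-level sink defeats dead-code elimination

func BenchmarkScalFlipFlop(b *testing.B) {
	x := make([]float64, 1000)
	for i := 0; i < b.N; i += 2 {
		ScalUnitary(2, x)   // scale up ...
		ScalUnitary(0.5, x) // ... then undo it, keeping values bounded
	}
	benchSink = x // the result escapes, so the work cannot be elided
}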
@@ -10,11 +10,16 @@ import (
 	"testing"
 )
 
-var dscalTests = []struct {
+var scalTests = []struct {
 	alpha float64
 	x     []float64
 	want  []float64
 }{
+	{
+		alpha: 0,
+		x:     []float64{},
+		want:  []float64{},
+	},
 	{
 		alpha: 0,
 		x:     []float64{1},
@@ -60,281 +65,115 @@ var dscalTests = []struct {
 		x:    []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9},
 		want: []float64{0, 2, -4, 6, 8, -10, 12, -14, 16, 18},
 	},
+	{
+		alpha: 3,
+		x:     []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9, 12},
+		want:  []float64{0, 3, -6, 9, 12, -15, 18, -21, 24, 27, 36},
+	},
 }
 
-func TestDscalUnitary(t *testing.T) {
-	for i, test := range dscalTests {
-		const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v"
-		prefix := fmt.Sprintf("test %v (x*=a)", i)
-		x, xFront, xBack := newGuardedVector(test.x, 1)
-		ScalUnitary(test.alpha, x)
-
-		if !allNaN(xFront) || !allNaN(xBack) {
-			t.Errorf(msgGuard, prefix, "x", xFront, xBack)
-		}
-		if !equalStrided(test.want, x, 1) {
-			t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x)
-		}
+func TestScalUnitary(t *testing.T) {
+	const xGdVal = -0.5
+	for i, test := range scalTests {
+		for _, align := range align1 {
+			prefix := fmt.Sprintf("Test %v (x:%v)", i, align)
+			xgLn := 4 + align
+			xg := guardVector(test.x, xGdVal, xgLn)
+			x := xg[xgLn : len(xg)-xgLn]
+
+			ScalUnitary(test.alpha, x)
+
+			for i := range test.want {
+				if !same(x[i], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, x[i], test.want[i])
+				}
+			}
+			if !isValidGuard(xg, xGdVal, xgLn) {
+				t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:])
+			}
+		}
 	}
 }
 
-func TestDscalUnitaryTo(t *testing.T) {
-	for i, test := range dscalTests {
-		const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v"
-
-		// Test dst = alpha * x.
-		prefix := fmt.Sprintf("test %v (dst=a*x)", i)
-		x, xFront, xBack := newGuardedVector(test.x, 1)
-		dst, dstFront, dstBack := newGuardedVector(test.x, 1)
-		ScalUnitaryTo(dst, test.alpha, x)
-
-		if !allNaN(xFront) || !allNaN(xBack) {
-			t.Errorf(msgGuard, prefix, "x", xFront, xBack)
-		}
-		if !allNaN(dstFront) || !allNaN(dstBack) {
-			t.Errorf(msgGuard, prefix, "dst", dstFront, dstBack)
-		}
-		if !equalStrided(test.x, x, 1) {
-			t.Errorf("%v: modified read-only x argument", prefix)
-		}
-		if !equalStrided(test.want, dst, 1) {
-			t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, dst)
-		}
-
-		// Test x = alpha * x.
-		prefix = fmt.Sprintf("test %v (x=a*x)", i)
-		x, xFront, xBack = newGuardedVector(test.x, 1)
-		ScalUnitaryTo(x, test.alpha, x)
-
-		if !allNaN(xFront) || !allNaN(xBack) {
-			t.Errorf(msgGuard, prefix, "x", xFront, xBack)
-		}
-
-		if !equalStrided(test.want, x, 1) {
-			t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x)
-		}
+func TestScalUnitaryTo(t *testing.T) {
+	const xGdVal, dstGdVal = -1, 0.5
+	rng := rand.New(rand.NewSource(42))
+	for i, test := range scalTests {
+		n := len(test.x)
+		for _, align := range align2 {
+			prefix := fmt.Sprintf("Test %v (x:%v dst:%v)", i, align.x, align.y)
+			xgLn, dgLn := 4+align.x, 4+align.y
+			xg := guardVector(test.x, xGdVal, xgLn)
+			dg := guardVector(randSlice(n, 1, rng), dstGdVal, dgLn)
+			x, dst := xg[xgLn:len(xg)-xgLn], dg[dgLn:len(dg)-dgLn]
+
+			ScalUnitaryTo(dst, test.alpha, x)
+
+			for i := range test.want {
+				if !same(dst[i], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, dst[i], test.want[i])
+				}
+			}
+			if !isValidGuard(xg, xGdVal, xgLn) {
+				t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:])
+			}
+			if !isValidGuard(dg, dstGdVal, dgLn) {
+				t.Errorf(msgGuard, prefix, "y", dg[:dgLn], dg[len(dg)-dgLn:])
+			}
+			if !equalStrided(test.x, x, 1) {
+				t.Errorf("%v: modified read-only x argument", prefix)
+			}
+		}
 	}
 }
 
-func TestDscalInc(t *testing.T) {
-	const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v"
-	for i, test := range dscalTests {
+func TestScalInc(t *testing.T) {
+	const xGdVal = -0.5
+	gdLn := 4
+	for i, test := range scalTests {
 		n := len(test.x)
 		for _, incX := range []int{1, 2, 3, 4, 7, 10} {
-			prefix := fmt.Sprintf("test %v (x*=a), incX = %v", i, incX)
-			x, xFront, xBack := newGuardedVector(test.x, incX)
+			prefix := fmt.Sprintf("Test %v (x:%v)", i, incX)
+			xg := guardIncVector(test.x, xGdVal, incX, gdLn)
+			x := xg[gdLn : len(xg)-gdLn]
+
 			ScalInc(test.alpha, x, uintptr(n), uintptr(incX))
 
-			if !allNaN(xFront) || !allNaN(xBack) {
-				t.Errorf(msgGuard, prefix, "x", xFront, xBack)
-			}
-			if nonStridedWrite(x, incX) {
-				t.Errorf("%v: modified x argument at non-stride position", prefix)
-			}
-
-			if !equalStrided(test.want, x, incX) {
-				t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x)
-			}
+			for i := range test.want {
+				if !same(x[i*incX], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, x[i*incX], test.want[i])
+				}
+			}
+			checkValidIncGuard(t, xg, xGdVal, incX, gdLn)
 		}
 	}
 }
 
-func TestDscalIncTo(t *testing.T) {
-	const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v"
-	for i, test := range dscalTests {
+func TestScalIncTo(t *testing.T) {
+	const xGdVal, dstGdVal = -1, 0.5
+	gdLn := 4
+	rng := rand.New(rand.NewSource(42))
+	for i, test := range scalTests {
 		n := len(test.x)
-		for _, incX := range []int{1, 2, 3, 4, 7, 10} {
-			// Test x = alpha * x.
-			prefix := fmt.Sprintf("test %v (x=a*x), incX = %v", i, incX)
-			x, xFront, xBack := newGuardedVector(test.x, incX)
-			ScalIncTo(x, uintptr(incX), test.alpha, x, uintptr(n), uintptr(incX))
-
-			if !allNaN(xFront) || !allNaN(xBack) {
-				t.Errorf(msgGuard, prefix, "x", xFront, xBack)
-			}
-			if nonStridedWrite(x, incX) {
-				t.Errorf("%v: modified x argument at non-stride position", prefix)
-			}
-			if !equalStrided(test.want, x, incX) {
-				t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, x)
-			}
-
-			for _, incDst := range []int{1, 2, 3, 4, 7, 10} {
-				// Test dst = alpha * x.
-				prefix = fmt.Sprintf("test %v (dst=a*x), incX = %v, incDst = %v", i, incX, incDst)
-				x, xFront, xBack = newGuardedVector(test.x, incX)
-				dst, dstFront, dstBack := newGuardedVector(test.x, incDst)
-				ScalIncTo(dst, uintptr(incDst), test.alpha, x, uintptr(n), uintptr(incX))
-
-				if !allNaN(xFront) || !allNaN(xBack) {
-					t.Errorf(msgGuard, prefix, "x", xFront, xBack)
-				}
-				if !allNaN(dstFront) || !allNaN(dstBack) {
-					t.Errorf(msgGuard, prefix, "dst", dstFront, dstBack)
-				}
-				if nonStridedWrite(x, incX) || !equalStrided(test.x, x, incX) {
-					t.Errorf("%v: modified read-only x argument", prefix)
-				}
-				if nonStridedWrite(dst, incDst) {
-					t.Errorf("%v: modified dst argument at non-stride position", prefix)
-				}
-
-				if !equalStrided(test.want, dst, incDst) {
-					t.Errorf("%v: unexpected result:\nwant: %v\ngot: %v", prefix, test.want, dst)
-				}
-			}
+		for _, inc := range newIncSet(1, 2, 3, 4, 7, 10) {
+			prefix := fmt.Sprintf("test %v (x:%v dst:%v)", i, inc.x, inc.y)
+			xg := guardIncVector(test.x, xGdVal, inc.x, gdLn)
+			dg := guardIncVector(randSlice(n, 1, rng), dstGdVal, inc.y, gdLn)
+			x, dst := xg[gdLn:len(xg)-gdLn], dg[gdLn:len(dg)-gdLn]
+
+			ScalIncTo(dst, uintptr(inc.y), test.alpha, x, uintptr(n), uintptr(inc.x))
+
+			for i := range test.want {
+				if !same(dst[i*inc.y], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, dst[i*inc.y], test.want[i])
+				}
+			}
+			checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn)
+			checkValidIncGuard(t, dg, dstGdVal, inc.y, gdLn)
+			if !equalStrided(test.x, x, inc.x) {
+				t.Errorf("%v: modified read-only x argument", prefix)
+			}
+
 		}
 	}
 }
-
-func BenchmarkDscalUnitaryN1(b *testing.B)      { benchmarkDscalUnitary(b, 1) }
-func BenchmarkDscalUnitaryN2(b *testing.B)      { benchmarkDscalUnitary(b, 2) }
-func BenchmarkDscalUnitaryN3(b *testing.B)      { benchmarkDscalUnitary(b, 3) }
-func BenchmarkDscalUnitaryN4(b *testing.B)      { benchmarkDscalUnitary(b, 4) }
-func BenchmarkDscalUnitaryN10(b *testing.B)     { benchmarkDscalUnitary(b, 10) }
-func BenchmarkDscalUnitaryN100(b *testing.B)    { benchmarkDscalUnitary(b, 100) }
-func BenchmarkDscalUnitaryN1000(b *testing.B)   { benchmarkDscalUnitary(b, 1000) }
-func BenchmarkDscalUnitaryN10000(b *testing.B)  { benchmarkDscalUnitary(b, 10000) }
-func BenchmarkDscalUnitaryN100000(b *testing.B) { benchmarkDscalUnitary(b, 100000) }
-
-func benchmarkDscalUnitary(b *testing.B, n int) {
-	x := randomSlice(n, 1)
-	b.ResetTimer()
-	for i := 0; i < b.N; i += 2 {
-		ScalUnitary(2, x)
-		ScalUnitary(0.5, x)
-	}
-	benchSink = x
-}
-
-func BenchmarkDscalUnitaryToN1(b *testing.B)      { benchmarkDscalUnitaryTo(b, 1) }
-func BenchmarkDscalUnitaryToN2(b *testing.B)      { benchmarkDscalUnitaryTo(b, 2) }
-func BenchmarkDscalUnitaryToN3(b *testing.B)      { benchmarkDscalUnitaryTo(b, 3) }
-func BenchmarkDscalUnitaryToN4(b *testing.B)      { benchmarkDscalUnitaryTo(b, 4) }
-func BenchmarkDscalUnitaryToN10(b *testing.B)     { benchmarkDscalUnitaryTo(b, 10) }
-func BenchmarkDscalUnitaryToN100(b *testing.B)    { benchmarkDscalUnitaryTo(b, 100) }
-func BenchmarkDscalUnitaryToN1000(b *testing.B)   { benchmarkDscalUnitaryTo(b, 1000) }
-func BenchmarkDscalUnitaryToN10000(b *testing.B)  { benchmarkDscalUnitaryTo(b, 10000) }
-func BenchmarkDscalUnitaryToN100000(b *testing.B) { benchmarkDscalUnitaryTo(b, 100000) }
-
-func benchmarkDscalUnitaryTo(b *testing.B, n int) {
-	x := randomSlice(n, 1)
-	dst := randomSlice(n, 1)
-	a := rand.Float64()
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		ScalUnitaryTo(dst, a, x)
-	}
-	benchSink = dst
-}
-
-func BenchmarkDscalUnitaryToXN1(b *testing.B)      { benchmarkDscalUnitaryToX(b, 1) }
-func BenchmarkDscalUnitaryToXN2(b *testing.B)      { benchmarkDscalUnitaryToX(b, 2) }
-func BenchmarkDscalUnitaryToXN3(b *testing.B)      { benchmarkDscalUnitaryToX(b, 3) }
-func BenchmarkDscalUnitaryToXN4(b *testing.B)      { benchmarkDscalUnitaryToX(b, 4) }
-func BenchmarkDscalUnitaryToXN10(b *testing.B)     { benchmarkDscalUnitaryToX(b, 10) }
-func BenchmarkDscalUnitaryToXN100(b *testing.B)    { benchmarkDscalUnitaryToX(b, 100) }
-func BenchmarkDscalUnitaryToXN1000(b *testing.B)   { benchmarkDscalUnitaryToX(b, 1000) }
-func BenchmarkDscalUnitaryToXN10000(b *testing.B)  { benchmarkDscalUnitaryToX(b, 10000) }
-func BenchmarkDscalUnitaryToXN100000(b *testing.B) { benchmarkDscalUnitaryToX(b, 100000) }
-
-func benchmarkDscalUnitaryToX(b *testing.B, n int) {
-	x := randomSlice(n, 1)
-	b.ResetTimer()
-	for i := 0; i < b.N; i += 2 {
-		ScalUnitaryTo(x, 2, x)
-		ScalUnitaryTo(x, 0.5, x)
-	}
-	benchSink = x
-}
-
-func BenchmarkDscalIncN1Inc1(b *testing.B) { benchmarkDscalInc(b, 1, 1) }
-
-func BenchmarkDscalIncN2Inc1(b *testing.B)  { benchmarkDscalInc(b, 2, 1) }
-func BenchmarkDscalIncN2Inc2(b *testing.B)  { benchmarkDscalInc(b, 2, 2) }
-func BenchmarkDscalIncN2Inc4(b *testing.B)  { benchmarkDscalInc(b, 2, 4) }
-func BenchmarkDscalIncN2Inc10(b *testing.B) { benchmarkDscalInc(b, 2, 10) }
-
-func BenchmarkDscalIncN3Inc1(b *testing.B)  { benchmarkDscalInc(b, 3, 1) }
-func BenchmarkDscalIncN3Inc2(b *testing.B)  { benchmarkDscalInc(b, 3, 2) }
-func BenchmarkDscalIncN3Inc4(b *testing.B)  { benchmarkDscalInc(b, 3, 4) }
-func BenchmarkDscalIncN3Inc10(b *testing.B) { benchmarkDscalInc(b, 3, 10) }
-
-func BenchmarkDscalIncN4Inc1(b *testing.B)  { benchmarkDscalInc(b, 4, 1) }
-func BenchmarkDscalIncN4Inc2(b *testing.B)  { benchmarkDscalInc(b, 4, 2) }
-func BenchmarkDscalIncN4Inc4(b *testing.B)  { benchmarkDscalInc(b, 4, 4) }
-func BenchmarkDscalIncN4Inc10(b *testing.B) { benchmarkDscalInc(b, 4, 10) }
-
-func BenchmarkDscalIncN10Inc1(b *testing.B)  { benchmarkDscalInc(b, 10, 1) }
-func BenchmarkDscalIncN10Inc2(b *testing.B)  { benchmarkDscalInc(b, 10, 2) }
-func BenchmarkDscalIncN10Inc4(b *testing.B)  { benchmarkDscalInc(b, 10, 4) }
-func BenchmarkDscalIncN10Inc10(b *testing.B) { benchmarkDscalInc(b, 10, 10) }
-
-func BenchmarkDscalIncN1000Inc1(b *testing.B)  { benchmarkDscalInc(b, 1000, 1) }
-func BenchmarkDscalIncN1000Inc2(b *testing.B)  { benchmarkDscalInc(b, 1000, 2) }
-func BenchmarkDscalIncN1000Inc4(b *testing.B)  { benchmarkDscalInc(b, 1000, 4) }
-func BenchmarkDscalIncN1000Inc10(b *testing.B) { benchmarkDscalInc(b, 1000, 10) }
-
-func BenchmarkDscalIncN100000Inc1(b *testing.B)  { benchmarkDscalInc(b, 100000, 1) }
-func BenchmarkDscalIncN100000Inc2(b *testing.B)  { benchmarkDscalInc(b, 100000, 2) }
-func BenchmarkDscalIncN100000Inc4(b *testing.B)  { benchmarkDscalInc(b, 100000, 4) }
-func BenchmarkDscalIncN100000Inc10(b *testing.B) { benchmarkDscalInc(b, 100000, 10) }
-
-func benchmarkDscalInc(b *testing.B, n, inc int) {
-	x := randomSlice(n, inc)
-	b.ResetTimer()
-	for i := 0; i < b.N; i += 2 {
-		ScalInc(2, x, uintptr(n), uintptr(inc))
-		ScalInc(0.5, x, uintptr(n), uintptr(inc))
-	}
-	benchSink = x
-}
-
-func BenchmarkDscalIncToN1Inc1(b *testing.B) { benchmarkDscalIncTo(b, 1, 1) }
-
-func BenchmarkDscalIncToN2Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 2, 1) }
-func BenchmarkDscalIncToN2Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 2, 2) }
-func BenchmarkDscalIncToN2Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 2, 4) }
-func BenchmarkDscalIncToN2Inc10(b *testing.B) { benchmarkDscalIncTo(b, 2, 10) }
-
-func BenchmarkDscalIncToN3Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 3, 1) }
-func BenchmarkDscalIncToN3Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 3, 2) }
-func BenchmarkDscalIncToN3Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 3, 4) }
-func BenchmarkDscalIncToN3Inc10(b *testing.B) { benchmarkDscalIncTo(b, 3, 10) }
-
-func BenchmarkDscalIncToN4Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 4, 1) }
-func BenchmarkDscalIncToN4Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 4, 2) }
-func BenchmarkDscalIncToN4Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 4, 4) }
-func BenchmarkDscalIncToN4Inc10(b *testing.B) { benchmarkDscalIncTo(b, 4, 10) }
-
-func BenchmarkDscalIncToN10Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 10, 1) }
-func BenchmarkDscalIncToN10Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 10, 2) }
-func BenchmarkDscalIncToN10Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 10, 4) }
-func BenchmarkDscalIncToN10Inc10(b *testing.B) { benchmarkDscalIncTo(b, 10, 10) }
-
-func BenchmarkDscalIncToN1000Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 1000, 1) }
-func BenchmarkDscalIncToN1000Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 1000, 2) }
-func BenchmarkDscalIncToN1000Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 1000, 4) }
-func BenchmarkDscalIncToN1000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 1000, 10) }
-
-func BenchmarkDscalIncToN100000Inc1(b *testing.B)  { benchmarkDscalIncTo(b, 100000, 1) }
-func BenchmarkDscalIncToN100000Inc2(b *testing.B)  { benchmarkDscalIncTo(b, 100000, 2) }
-func BenchmarkDscalIncToN100000Inc4(b *testing.B)  { benchmarkDscalIncTo(b, 100000, 4) }
-func BenchmarkDscalIncToN100000Inc10(b *testing.B) { benchmarkDscalIncTo(b, 100000, 10) }
-
-func benchmarkDscalIncTo(b *testing.B, n, inc int) {
-	x := randomSlice(n, inc)
-	dst := randomSlice(n, inc)
-	a := rand.Float64()
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		ScalIncTo(dst, uintptr(inc), a, x, uintptr(n), uintptr(inc))
-	}
-	benchSink = dst
-}
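The rewritten tests above swap the NaN-guard helpers (newGuardedVector, allNaN, nonStridedWrite) for guardVector/guardIncVector, isValidGuard, and checkValidIncGuard: the input is embedded in a longer slice whose front and back are filled with a known sentinel, the kernel runs on the interior only, and the sentinels are checked afterwards for out-of-bounds writes. A minimal sketch of the idea, assuming nothing about the real helpers beyond what the diff shows:

// guardedApply is a sketch of the guard-vector technique used above.
// It is illustrative; the real guardVector/isValidGuard helpers live in
// the package's test files.
func guardedApply(f func(x []float64), src []float64, gdVal float64, gdLn int) (okFront, okBack bool) {
	g := make([]float64, len(src)+2*gdLn)
	for i := range g {
		g[i] = gdVal // sentinel value fills the guard regions
	}
	copy(g[gdLn:], src)
	f(g[gdLn : len(g)-gdLn]) // the kernel sees only the interior

	okFront, okBack = true, true
	for _, v := range g[:gdLn] {
		okFront = okFront && v == gdVal
	}
	for _, v := range g[len(g)-gdLn:] {
		okBack = okBack && v == gdVal
	}
	return okFront, okBack
}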
@@ -38,43 +38,76 @@
 
 #include "textflag.h"
 
-// func DscalInc(alpha float64, x []float64, n, incX uintptr)
+#define X_PTR SI
+#define LEN CX
+#define TAIL BX
+#define INC_X R8
+#define INCx3_X R9
+#define ALPHA X0
+#define ALPHA_2 X1
+
+// func ScalInc(alpha float64, x []float64, n, incX uintptr)
 TEXT ·ScalInc(SB), NOSPLIT, $0
-	MOVHPD alpha+0(FP), X7
-	MOVLPD alpha+0(FP), X7
-	MOVQ   x+8(FP), R8
-	MOVQ   n+32(FP), DX
-	MOVQ   incX+40(FP), R10
+	MOVSD alpha+0(FP), ALPHA  // ALPHA = alpha
+	MOVQ  x_base+8(FP), X_PTR // X_PTR = &x
+	MOVQ  incX+40(FP), INC_X  // INC_X = incX
+	SHLQ  $3, INC_X           // INC_X *= sizeof(float64)
+	MOVQ  n+32(FP), LEN       // LEN = n
+	CMPQ  LEN, $0
+	JE    end // if LEN == 0 { return }
 
-	MOVQ $0, SI
-	MOVQ R10, AX // nextX = incX
-	SHLQ $1, R10 // incX *= 2
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL   // TAIL = LEN % 4
+	SHRQ $2, LEN    // LEN = floor( LEN / 4 )
+	JZ   tail_start // if LEN == 0 { goto tail_start }
 
-	SUBQ $2, DX // n -= 2
-	JL   tail   // if n < 0
+	MOVUPS ALPHA, ALPHA_2            // ALPHA_2 = ALPHA for pipelining
+	LEAQ   (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
 
-loop:
-	// x[i] *= alpha unrolled 2x.
-	MOVHPD 0(R8)(SI*8), X0
-	MOVLPD 0(R8)(AX*8), X0
-	MULPD  X7, X0
-	MOVHPD X0, 0(R8)(SI*8)
-	MOVLPD X0, 0(R8)(AX*8)
+loop: // do { // x[i] *= alpha unrolled 4x.
+	MOVSD (X_PTR), X2 // X_i = x[i]
+	MOVSD (X_PTR)(INC_X*1), X3
+	MOVSD (X_PTR)(INC_X*2), X4
+	MOVSD (X_PTR)(INCx3_X*1), X5
 
-	ADDQ R10, SI // ix += incX
-	ADDQ R10, AX // nextX += incX
+	MULSD ALPHA, X2 // X_i *= a
+	MULSD ALPHA_2, X3
+	MULSD ALPHA, X4
+	MULSD ALPHA_2, X5
 
-	SUBQ $2, DX // n -= 2
-	JGE  loop   // if n >= 0 goto loop
+	MOVSD X2, (X_PTR) // x[i] = X_i
+	MOVSD X3, (X_PTR)(INC_X*1)
+	MOVSD X4, (X_PTR)(INC_X*2)
+	MOVSD X5, (X_PTR)(INCx3_X*1)
 
-tail:
-	ADDQ $2, DX // n += 2
-	JLE  end    // if n <= 0
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
+	DECQ LEN
+	JNZ  loop // } while --LEN > 0
+	CMPQ TAIL, $0
+	JE   end // if TAIL == 0 { return }
 
-	// x[i] *= alpha for the last iteration if n is odd.
-	MOVSD 0(R8)(SI*8), X0
-	MULSD X7, X0
-	MOVSD X0, 0(R8)(SI*8)
+tail_start: // Reset loop registers
+	MOVQ TAIL, LEN // Loop counter: LEN = TAIL
+	SHRQ $1, LEN   // LEN = floor( LEN / 2 )
+	JZ   tail_one
+
+tail_two: // do {
+	MOVSD (X_PTR), X2 // X_i = x[i]
+	MOVSD (X_PTR)(INC_X*1), X3
+	MULSD ALPHA, X2 // X_i *= a
+	MULSD ALPHA, X3
+	MOVSD X2, (X_PTR) // x[i] = X_i
+	MOVSD X3, (X_PTR)(INC_X*1)
+
+	LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2])
+
+	ANDQ $1, TAIL
+	JZ   end
+
+tail_one:
+	MOVSD (X_PTR), X2 // X_i = x[i]
+	MULSD ALPHA, X2   // X_i *= ALPHA
+	MOVSD X2, (X_PTR) // x[i] = X_i
 
 end:
 	RET
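For readability, here is the control flow of the new ScalInc body transcribed into Go (a sketch of the loop structure only; the function name is made up and this code is not part of the package). The main loop handles four strided elements per pass, and the tail then processes LEN % 4 elements as an optional pair followed by an optional single:

func scalIncShape(alpha float64, x []float64, n, incX int) {
	i := 0
	// Main loop: 4 elements per iteration; the assembly alternates two
	// copies of alpha to keep the multiplies pipelined.
	for ; n >= 4; n -= 4 {
		x[i] *= alpha
		x[i+incX] *= alpha
		x[i+2*incX] *= alpha
		x[i+3*incX] *= alpha
		i += 4 * incX
	}
	// Tail: a remaining pair, then at most one element.
	if n >= 2 {
		x[i] *= alpha
		x[i+incX] *= alpha
		i += 2 * incX
		n -= 2
	}
	if n == 1 {
		x[i] *= alpha
	}
}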
@@ -38,50 +38,85 @@
 
 #include "textflag.h"
 
-// func DscalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr)
+#define X_PTR SI
+#define DST_PTR DI
+#define LEN CX
+#define TAIL BX
+#define INC_X R8
+#define INCx3_X R9
+#define INC_DST R10
+#define INCx3_DST R11
+#define ALPHA X0
+#define ALPHA_2 X1
+
+// func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr)
 TEXT ·ScalIncTo(SB), NOSPLIT, $0
-	MOVQ   dst+0(FP), R9
-	MOVQ   incDst+24(FP), R11
-	MOVHPD alpha+32(FP), X7
-	MOVLPD alpha+32(FP), X7
-	MOVQ   x+40(FP), R8
-	MOVQ   n+64(FP), DX
-	MOVQ   incX+72(FP), R10
+	MOVQ  dst_base+0(FP), DST_PTR // DST_PTR = &dst
+	MOVQ  incDst+24(FP), INC_DST  // INC_DST = incDst
+	SHLQ  $3, INC_DST             // INC_DST *= sizeof(float64)
+	MOVSD alpha+32(FP), ALPHA     // ALPHA = alpha
+	MOVQ  x_base+40(FP), X_PTR    // X_PTR = &x
+	MOVQ  n+64(FP), LEN           // LEN = n
+	MOVQ  incX+72(FP), INC_X      // INC_X = incX
+	SHLQ  $3, INC_X               // INC_X *= sizeof(float64)
+	CMPQ  LEN, $0
+	JE    end // if LEN == 0 { return }
 
-	MOVQ $0, SI
-	MOVQ $0, DI
-	MOVQ R10, AX // nextX = incX
-	MOVQ R11, BX // nextDst = incDst
-	SHLQ $1, R10 // incX *= 2
-	SHLQ $1, R11 // incDst *= 2
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL   // TAIL = LEN % 4
+	SHRQ $2, LEN    // LEN = floor( LEN / 4 )
+	JZ   tail_start // if LEN == 0 { goto tail_start }
 
-	SUBQ $2, DX // n -= 2
-	JL   tail   // if n < 0
+	MOVUPS ALPHA, ALPHA_2                  // ALPHA_2 = ALPHA for pipelining
+	LEAQ   (INC_X)(INC_X*2), INCx3_X       // INCx3_X = INC_X * 3
+	LEAQ   (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3
 
-loop:
-	// dst[i] = alpha * x[i] unrolled 2x.
-	MOVHPD 0(R8)(SI*8), X0
-	MOVLPD 0(R8)(AX*8), X0
-	MULPD  X7, X0
-	MOVHPD X0, 0(R9)(DI*8)
-	MOVLPD X0, 0(R9)(BX*8)
+loop: // do { // x[i] *= alpha unrolled 4x.
+	MOVSD (X_PTR), X2 // X_i = x[i]
+	MOVSD (X_PTR)(INC_X*1), X3
+	MOVSD (X_PTR)(INC_X*2), X4
+	MOVSD (X_PTR)(INCx3_X*1), X5
 
-	ADDQ R10, SI // ix += incX
-	ADDQ R10, AX // nextX += incX
-	ADDQ R11, DI // idst += incDst
-	ADDQ R11, BX // nextDst += incDst
+	MULSD ALPHA, X2 // X_i *= a
+	MULSD ALPHA_2, X3
+	MULSD ALPHA, X4
+	MULSD ALPHA_2, X5
 
-	SUBQ $2, DX // n -= 2
-	JGE  loop   // if n >= 0 goto loop
+	MOVSD X2, (DST_PTR) // dst[i] = X_i
+	MOVSD X3, (DST_PTR)(INC_DST*1)
+	MOVSD X4, (DST_PTR)(INC_DST*2)
+	MOVSD X5, (DST_PTR)(INCx3_DST*1)
 
-tail:
-	ADDQ $2, DX // n += 2
-	JLE  end    // if n <= 0
+	LEAQ (X_PTR)(INC_X*4), X_PTR       // X_PTR = &(X_PTR[incX*4])
+	LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4])
+	DECQ LEN
+	JNZ  loop // } while --LEN > 0
+	CMPQ TAIL, $0
+	JE   end // if TAIL == 0 { return }
 
-	// dst[i] = alpha * x[i] for the last iteration if n is odd.
-	MOVSD 0(R8)(SI*8), X0
-	MULSD X7, X0
-	MOVSD X0, 0(R9)(DI*8)
+tail_start: // Reset loop registers
+	MOVQ TAIL, LEN // Loop counter: LEN = TAIL
+	SHRQ $1, LEN   // LEN = floor( LEN / 2 )
+	JZ   tail_one
+
+tail_two:
+	MOVSD (X_PTR), X2 // X_i = x[i]
+	MOVSD (X_PTR)(INC_X*1), X3
+	MULSD ALPHA, X2 // X_i *= a
+	MULSD ALPHA, X3
+	MOVSD X2, (DST_PTR) // dst[i] = X_i
+	MOVSD X3, (DST_PTR)(INC_DST*1)
+
+	LEAQ (X_PTR)(INC_X*2), X_PTR       // X_PTR = &(X_PTR[incX*2])
+	LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2])
+
+	ANDQ $1, TAIL
+	JZ   end
+
+tail_one:
+	MOVSD (X_PTR), X2   // X_i = x[i]
+	MULSD ALPHA, X2     // X_i *= ALPHA
+	MOVSD X2, (DST_PTR) // x[i] = X_i
 
 end:
 	RET
@@ -38,43 +38,75 @@
 
 #include "textflag.h"
 
-// func DscalUnitary(alpha float64, x []float64)
+#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // @ MOVDDUP XMM0, 8[RSP]
+
+#define X_PTR SI
+#define DST_PTR DI
+#define IDX AX
+#define LEN CX
+#define TAIL BX
+#define ALPHA X0
+#define ALPHA_2 X1
+
+// func ScalUnitary(alpha float64, x []float64)
 TEXT ·ScalUnitary(SB), NOSPLIT, $0
-	MOVHPD alpha+0(FP), X7
-	MOVLPD alpha+0(FP), X7
-	MOVQ   x+8(FP), R8
-	MOVQ   x_len+16(FP), DI // n = len(x)
+	MOVDDUP_ALPHA            // ALPHA = { alpha, alpha }
+	MOVQ x_base+8(FP), X_PTR // X_PTR = &x
+	MOVQ x_len+16(FP), LEN   // LEN = len(x)
+	CMPQ LEN, $0
+	JE   end      // if LEN == 0 { return }
+	XORQ IDX, IDX // IDX = 0
 
-	MOVQ $0, SI // i = 0
-	SUBQ $4, DI // n -= 4
-	JL   tail   // if n < 0 goto tail
+	MOVQ LEN, TAIL
+	ANDQ $7, TAIL   // TAIL = LEN % 8
+	SHRQ $3, LEN    // LEN = floor( LEN / 8 )
+	JZ   tail_start // if LEN == 0 { goto tail_start }
 
-loop:
-	// x[i] *= alpha unrolled 4x.
-	MOVUPD 0(R8)(SI*8), X0
-	MOVUPD 16(R8)(SI*8), X1
-	MULPD  X7, X0
-	MULPD  X7, X1
-	MOVUPD X0, 0(R8)(SI*8)
-	MOVUPD X1, 16(R8)(SI*8)
+	MOVUPS ALPHA, ALPHA_2
 
-	ADDQ $4, SI // i += 4
-	SUBQ $4, DI // n -= 4
-	JGE  loop   // if n >= 0 goto loop
+loop: // do { // x[i] *= alpha unrolled 8x.
+	MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i]
+	MOVUPS 16(X_PTR)(IDX*8), X3
+	MOVUPS 32(X_PTR)(IDX*8), X4
+	MOVUPS 48(X_PTR)(IDX*8), X5
 
-tail:
-	ADDQ $4, DI // n += 4
-	JZ   end    // if n == 0 goto end
+	MULPD ALPHA, X2 // X_i *= ALPHA
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
 
-onemore:
-	// x[i] *= alpha for the remaining 1-3 elements.
-	MOVSD 0(R8)(SI*8), X0
-	MULSD X7, X0
-	MOVSD X0, 0(R8)(SI*8)
+	MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i
+	MOVUPS X3, 16(X_PTR)(IDX*8)
+	MOVUPS X4, 32(X_PTR)(IDX*8)
+	MOVUPS X5, 48(X_PTR)(IDX*8)
 
-	ADDQ $1, SI  // i++
-	SUBQ $1, DI  // n--
-	JNZ  onemore // if n != 0 goto onemore
+	ADDQ $8, IDX // i += 8
+	DECQ LEN
+	JNZ  loop // while --LEN > 0
+	CMPQ TAIL, $0
+	JE   end // if TAIL == 0 { return }
+
+tail_start: // Reset loop registers
+	MOVQ TAIL, LEN // Loop counter: LEN = TAIL
+	SHRQ $1, LEN   // LEN = floor( TAIL / 2 )
+	JZ   tail_one  // if n == 0 goto end
+
+tail_two: // do {
+	MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i]
+	MULPD  ALPHA, X2          // X_i *= ALPHA
+	MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i
+	ADDQ   $2, IDX            // i += 2
+	DECQ   LEN
+	JNZ    tail_two // while --LEN > 0
+
+	ANDQ $1, TAIL
+	JZ   end // if TAIL == 0 { return }
+
+tail_one:
+	// x[i] *= alpha for the remaining element.
+	MOVSD (X_PTR)(IDX*8), X2
+	MULSD ALPHA, X2
+	MOVSD X2, (X_PTR)(IDX*8)
 
 end:
 	RET
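Two implementation details of ScalUnitary above deserve a note. The MOVDDUP_ALPHA macro hand-encodes a MOVDDUP instruction as raw LONG/WORD bytes, presumably because the assembler in use did not accept the mnemonic; the instruction broadcasts alpha into both halves of X0 so that each MULPD scales a pair of float64s. The body also alternates between ALPHA and its copy ALPHA_2, breaking the single-register dependency chain so more multiplies can be in flight at once. In Go-shaped pseudocode the structure is (a sketch only; the name is made up):

func scalUnitaryShape(alpha float64, x []float64) {
	i, n := 0, len(x)
	for ; n >= 8; n -= 8 { // main loop: four MULPD ops per pass,
		for j := 0; j < 8; j++ { // each scaling a pair of elements
			x[i+j] *= alpha
		}
		i += 8
	}
	for ; n >= 2; n -= 2 { // tail pairs (MULPD)
		x[i] *= alpha
		x[i+1] *= alpha
		i += 2
	}
	if n == 1 { // final scalar (MULSD)
		x[i] *= alpha
	}
}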
@@ -38,45 +38,76 @@
 
 #include "textflag.h"
 
-// func DscalUnitaryTo(dst []float64, alpha float64, x []float64)
+#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x2024 // @ MOVDDUP 32(SP), X0 /*XMM0, 32[RSP]*/
+
+#define X_PTR SI
+#define DST_PTR DI
+#define IDX AX
+#define LEN CX
+#define TAIL BX
+#define ALPHA X0
+#define ALPHA_2 X1
+
+// func ScalUnitaryTo(dst []float64, alpha float64, x []float64)
 // This function assumes len(dst) >= len(x).
 TEXT ·ScalUnitaryTo(SB), NOSPLIT, $0
-	MOVQ   dst+0(FP), R9
-	MOVHPD alpha+24(FP), X7
-	MOVLPD alpha+24(FP), X7
-	MOVQ   x+32(FP), R8
-	MOVQ   x_len+40(FP), DI // n = len(x)
+	MOVQ x_base+32(FP), X_PTR    // X_PTR = &x
+	MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst
+	MOVDDUP_ALPHA                // ALPHA = { alpha, alpha }
+	MOVQ x_len+40(FP), LEN       // LEN = len(x)
+	CMPQ LEN, $0
+	JE   end // if LEN == 0 { return }
 
-	MOVQ $0, SI // i = 0
-	SUBQ $4, DI // n -= 4
-	JL   tail   // if n < 0 goto tail
+	XORQ IDX, IDX // IDX = 0
+	MOVQ LEN, TAIL
+	ANDQ $7, TAIL   // TAIL = LEN % 8
+	SHRQ $3, LEN    // LEN = floor( LEN / 8 )
+	JZ   tail_start // if LEN == 0 { goto tail_start }
 
-loop:
-	// dst[i] = alpha * x[i] unrolled 4x.
-	MOVUPD 0(R8)(SI*8), X0
-	MOVUPD 16(R8)(SI*8), X1
-	MULPD  X7, X0
-	MULPD  X7, X1
-	MOVUPD X0, 0(R9)(SI*8)
-	MOVUPD X1, 16(R9)(SI*8)
+	MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining
 
-	ADDQ $4, SI // i += 4
-	SUBQ $4, DI // n -= 4
-	JGE  loop   // if n >= 0 goto loop
+loop: // do { // dst[i] = alpha * x[i] unrolled 8x.
+	MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i]
+	MOVUPS 16(X_PTR)(IDX*8), X3
+	MOVUPS 32(X_PTR)(IDX*8), X4
+	MOVUPS 48(X_PTR)(IDX*8), X5
 
-tail:
-	ADDQ $4, DI // n += 4
-	JZ   end    // if n == 0 goto end
+	MULPD ALPHA, X2 // X_i *= ALPHA
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
 
-onemore:
-	// dst[i] = alpha * x[i] for the remaining 1-3 elements.
-	MOVSD 0(R8)(SI*8), X0
-	MULSD X7, X0
-	MOVSD X0, 0(R9)(SI*8)
+	MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i
+	MOVUPS X3, 16(DST_PTR)(IDX*8)
+	MOVUPS X4, 32(DST_PTR)(IDX*8)
+	MOVUPS X5, 48(DST_PTR)(IDX*8)
 
-	ADDQ $1, SI  // i++
-	SUBQ $1, DI  // n--
-	JNZ  onemore // if n != 0 goto onemore
+	ADDQ $8, IDX // i += 8
+	DECQ LEN
+	JNZ  loop // while --LEN > 0
+	CMPQ TAIL, $0
+	JE   end // if TAIL == 0 { return }
+
+tail_start: // Reset loop counters
+	MOVQ TAIL, LEN // Loop counter: LEN = TAIL
+	SHRQ $1, LEN   // LEN = floor( TAIL / 2 )
+	JZ   tail_one  // if LEN == 0 { goto tail_one }
+
+tail_two: // do {
+	MOVUPS (X_PTR)(IDX*8), X2   // X_i = x[i]
+	MULPD  ALPHA, X2            // X_i *= ALPHA
+	MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i
+	ADDQ   $2, IDX              // i += 2
+	DECQ   LEN
+	JNZ    tail_two // while --LEN > 0
+
+	ANDQ $1, TAIL
+	JZ   end // if TAIL == 0 { return }
+
+tail_one:
+	MOVSD (X_PTR)(IDX*8), X2   // X_i = x[i]
+	MULSD ALPHA, X2            // X_i *= ALPHA
+	MOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X_i
 
 end:
 	RET