f32 axpy routines with tests and cache alignment.

This commit is contained in:
Chad Kunde
2016-05-18 11:39:26 -07:00
parent b65a9cb692
commit 376bf8a4ad
4 changed files with 279 additions and 0 deletions

56
asm/f32/axpyinc_amd64.s Normal file
View File

@@ -0,0 +1,56 @@
// Copyright ©2016 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine
#include "textflag.h"
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
TEXT ·AxpyInc(SB), NOSPLIT, $0
MOVQ n+56(FP), CX
CMPQ CX, $0
JLE saxyi_end
MOVQ x_base+8(FP), SI
MOVQ y_base+32(FP), DI
MOVQ ix+80(FP), AX
MOVQ iy+88(FP), BX
LEAQ (SI)(AX*4), SI
LEAQ (DI)(BX*4), DI
MOVQ incX+64(FP), AX
MOVQ incY+72(FP), BX
IMULQ $4, AX
IMULQ $4, BX
MOVSS alpha+0(FP), X0
MOVSS X0, X2
XORQ R9, R9
SHRQ $1, CX
CMOVQCS AX, R9
JZ saxyi_odd
saxyi_loop:
MOVSS (SI), X1
MOVSS (SI)(AX*1), X3
MULSS X0, X1
MULSS X2, X3
ADDSS (DI), X1
ADDSS (DI)(BX*1), X3
MOVSS X1, (DI)
MOVSS X3, (DI)(AX*1)
LEAQ (SI)(AX*2), SI
LEAQ (DI)(BX*2), DI
LOOPNE saxyi_loop
CMPQ R9, $0
JE saxyi_end
saxyi_odd:
// Trim odd n
MOVSS (SI), X1
MULSS X0, X1
ADDSS (DI), X1
MOVSS X1, (DI)
ADDQ AX, SI
ADDQ BX, DI
saxyi_end:
RET

View File

@@ -0,0 +1,56 @@
// Copyright ©2016 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine
#include "textflag.h"
// func AxpyUnitary(alpha float32, x, y []float32)
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SI // Load data buffer pointers
MOVQ y_base+32(FP), DI
MOVQ x_len+16(FP), CX // CX = min( len(x), len(y) )
CMPQ y_len+40(FP), CX
CMOVLEQ y_len+40(FP), CX
CMPQ CX, $0
JE caxy_end
MOVSS alpha+0(FP), X0
SHUFPS $0, X0, X0 // Load alpha into X0 4 times
MOVUPS X0, X2 // Copy to X2 for pipelining
XORQ AX, AX // i = 0
PXOR X1, X1 // 2 NOP instructions (PXOR) to align
PXOR X3, X3 // loop to cache line
MOVQ CX, BX
ANDQ $7, BX // BX = len % 8
SHRQ $3, CX // CX = int(len / 8)
CMPQ CX, $0
JE caxy_tail_start
caxy_loop:
MOVUPS (SI)(AX*4), X1 // xmm = x[i:i+4]
MOVUPS 16(SI)(AX*4), X3
MULPS X0, X1 // xmm *= a
MULPS X2, X3
ADDPS (DI)(AX*4), X1 // xmm += y[i:i+4]
ADDPS 16(DI)(AX*4), X3
MOVUPS X1, (DI)(AX*4) // y[i:i+4] = xmm
MOVUPS X3, 16(DI)(AX*4)
ADDQ $8, AX // i+=8
LOOPNE caxy_loop // while (--CX) > 0
CMPQ BX, $0
JE caxy_end
caxy_tail_start:
MOVQ BX, CX
caxy_tail:
MOVSS (SI)(AX*4), X1
MULSS X0, X1
ADDSS (DI)(AX*4), X1
MOVSS X1, (DI)(AX*4)
INCQ AX
LOOPNE caxy_tail
caxy_end:
RET

View File

@@ -0,0 +1,57 @@
// Copyright ©2016 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine
#include "textflag.h"
// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // Load data buffer pointers
MOVQ x_base+32(FP), SI
MOVQ y_base+56(FP), DX
MOVQ x_len+40(FP), CX // CX = min( len(x), len(y), len(dst) )
CMPQ y_len+64(FP), CX
CMOVLEQ y_len+64(FP), CX
CMPQ dst_len+8(FP), CX
CMOVLEQ dst_len+8(FP), CX
CMPQ CX, $0 // Empty return
JE caxy_end
MOVSS alpha+24(FP), X0
SHUFPS $0, X0, X0 // Load alpha 4 times
MOVUPS X0, X2 // Copy to X2 for pipelining
XORQ AX, AX // i = 0
MOVQ CX, BX
ANDQ $7, BX // BX = len % 8
SHRQ $3, CX // CX = int(len / 8)
CMPQ CX, $0
JE caxy_tail_start
caxy_loop:
MOVUPS (SI)(AX*4), X1 // xmm = x[i:i+4]
MOVUPS 16(SI)(AX*4), X3
MULPS X0, X1 // xmm *= a
MULPS X2, X3
ADDPS (DX)(AX*4), X1 // xmm += y[i:i+4]
ADDPS 16(DX)(AX*4), X3
MOVUPS X1, (DI)(AX*4) // dst[i:i+4] = xmm
MOVUPS X3, 16(DI)(AX*4)
ADDQ $8, AX // i+=8
LOOPNE caxy_loop // while (--CX) > 0
CMPQ BX, $0
JE caxy_end
caxy_tail_start:
MOVQ BX, CX
caxy_tail:
MOVSS (SI)(AX*4), X1
MULSS X0, X1
ADDSS (DX)(AX*4), X1
MOVSS X1, (DI)(AX*4)
INCQ AX
LOOPNE caxy_tail
caxy_end:
RET

110
asm/f32/stubs_test.go Normal file
View File

@@ -0,0 +1,110 @@
// Copyright ©2015 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package f32
import (
"math"
"testing"
)
var (
nan, inf, ninf float32
)
func init() {
nan, inf, ninf = float32(math.NaN()), float32(math.Inf(1)), float32(math.Inf(-1))
}
func diff(x, y float32) bool {
a, b := float64(x), float64(y)
return x != y && !math.IsNaN(a) && !math.IsNaN(b) || (math.IsNaN(a) != math.IsNaN(b))
}
func TestAxpyUnitary(t *testing.T) {
for i, v := range []struct {
a float32
x, y []float32
ex []float32
}{
{0, []float32{}, []float32{}, []float32{}},
{nan, []float32{1, 2, 3}, []float32{1, 2, 3, 4}, []float32{nan, nan, nan}},
{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
{-2, []float32{5, 4, 3}, []float32{1, 3, 5}, []float32{-9, -5, -1}},
} {
AxpyUnitary(v.a, v.x, v.y)
for j := range v.ex {
if diff(v.ex[j], v.y[j]) {
t.Log("Test", i, "Unexpected value at", j, "Got:", v.y[j], "Expected:", v.ex[j])
t.Fail()
}
}
}
}
func TestAxpyUnitaryTo(t *testing.T) {
for i, v := range []struct {
a float32
x, y, dst []float32
ex []float32
}{
{0, []float32{}, []float32{}, []float32{}, []float32{}},
{nan, []float32{1, 2, 3},
[]float32{1, 2, 3, 4},
[]float32{0, 0, 0},
[]float32{nan, nan, nan}},
{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
make([]float32, 8),
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
{-2, []float32{5, 4, 3},
[]float32{1, 3, 5},
[]float32{0, 0, 0},
[]float32{-9, -5, -1}},
} {
AxpyUnitaryTo(v.dst, v.a, v.x, v.y)
for j := range v.ex {
if diff(v.ex[j], v.dst[j]) {
t.Log("Test", i, "Unexpected value at", j, "Got:", v.dst[j], "Expected:", v.ex[j])
t.Fail()
}
}
}
}
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
func TestAxpyInc(t *testing.T) {
for i, v := range []struct {
a float32
x, y []float32
ex []float32
n, incX, incY, ix, iy uintptr
}{
{0, []float32{}, []float32{}, []float32{}, 0, 10, 10, 5, 5},
{nan, []float32{1, 2, 3},
[]float32{1, 2, 3, 4},
[]float32{nan, nan, nan}, 3, 1, 1, 0, 0},
{0, []float32{1, 2, 3},
[]float32{1, 2, 3, 4},
[]float32{nan, nan, 3}, 1, 1, 1, 2, 2},
/*{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
make([]float32, 8),
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
{-2, []float32{5, 4, 3},
[]float32{1, 3, 5},
[]float32{0, 0, 0},
[]float32{-9, -5, -1}},*/
} {
AxpyInc(v.a, v.x, v.y, v.n, v.incX, v.incY, v.ix, v.iy)
for j, k := v.iy, 0; k < int(v.n); j, k = j+v.incY, k+1 {
if diff(v.ex[j], v.y[j]) {
t.Log("Test", i, "Unexpected value at", j, "Got:", v.y[j], "Expected:", v.ex[j])
t.Fail()
}
}
}
}