mirror of
https://github.com/gonum/gonum.git
synced 2025-10-23 15:13:31 +08:00
f32 axpy routines with tests and cache alignment.
This commit is contained in:
56
asm/f32/axpyinc_amd64.s
Normal file
56
asm/f32/axpyinc_amd64.s
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
// Copyright ©2016 The gonum Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//+build !noasm,!appengine
|
||||||
|
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
|
||||||
|
TEXT ·AxpyInc(SB), NOSPLIT, $0
|
||||||
|
MOVQ n+56(FP), CX
|
||||||
|
CMPQ CX, $0
|
||||||
|
JLE saxyi_end
|
||||||
|
MOVQ x_base+8(FP), SI
|
||||||
|
MOVQ y_base+32(FP), DI
|
||||||
|
MOVQ ix+80(FP), AX
|
||||||
|
MOVQ iy+88(FP), BX
|
||||||
|
LEAQ (SI)(AX*4), SI
|
||||||
|
LEAQ (DI)(BX*4), DI
|
||||||
|
MOVQ incX+64(FP), AX
|
||||||
|
MOVQ incY+72(FP), BX
|
||||||
|
IMULQ $4, AX
|
||||||
|
IMULQ $4, BX
|
||||||
|
MOVSS alpha+0(FP), X0
|
||||||
|
MOVSS X0, X2
|
||||||
|
XORQ R9, R9
|
||||||
|
SHRQ $1, CX
|
||||||
|
CMOVQCS AX, R9
|
||||||
|
JZ saxyi_odd
|
||||||
|
|
||||||
|
saxyi_loop:
|
||||||
|
MOVSS (SI), X1
|
||||||
|
MOVSS (SI)(AX*1), X3
|
||||||
|
MULSS X0, X1
|
||||||
|
MULSS X2, X3
|
||||||
|
ADDSS (DI), X1
|
||||||
|
ADDSS (DI)(BX*1), X3
|
||||||
|
MOVSS X1, (DI)
|
||||||
|
MOVSS X3, (DI)(AX*1)
|
||||||
|
LEAQ (SI)(AX*2), SI
|
||||||
|
LEAQ (DI)(BX*2), DI
|
||||||
|
LOOPNE saxyi_loop
|
||||||
|
CMPQ R9, $0
|
||||||
|
JE saxyi_end
|
||||||
|
|
||||||
|
saxyi_odd:
|
||||||
|
// Trim odd n
|
||||||
|
MOVSS (SI), X1
|
||||||
|
MULSS X0, X1
|
||||||
|
ADDSS (DI), X1
|
||||||
|
MOVSS X1, (DI)
|
||||||
|
ADDQ AX, SI
|
||||||
|
ADDQ BX, DI
|
||||||
|
|
||||||
|
saxyi_end:
|
||||||
|
RET
|
56
asm/f32/axpyunitary_amd64.s
Normal file
56
asm/f32/axpyunitary_amd64.s
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
// Copyright ©2016 The gonum Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//+build !noasm,!appengine
|
||||||
|
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
// func AxpyUnitary(alpha float32, x, y []float32)
|
||||||
|
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
|
||||||
|
MOVQ x_base+8(FP), SI // Load data buffer pointers
|
||||||
|
MOVQ y_base+32(FP), DI
|
||||||
|
MOVQ x_len+16(FP), CX // CX = min( len(x), len(y) )
|
||||||
|
CMPQ y_len+40(FP), CX
|
||||||
|
CMOVLEQ y_len+40(FP), CX
|
||||||
|
CMPQ CX, $0
|
||||||
|
JE caxy_end
|
||||||
|
MOVSS alpha+0(FP), X0
|
||||||
|
SHUFPS $0, X0, X0 // Load alpha into X0 4 times
|
||||||
|
MOVUPS X0, X2 // Copy to X2 for pipelining
|
||||||
|
XORQ AX, AX // i = 0
|
||||||
|
PXOR X1, X1 // 2 NOP instructions (PXOR) to align
|
||||||
|
PXOR X3, X3 // loop to cache line
|
||||||
|
MOVQ CX, BX
|
||||||
|
ANDQ $7, BX // BX = len % 8
|
||||||
|
SHRQ $3, CX // CX = int(len / 8)
|
||||||
|
CMPQ CX, $0
|
||||||
|
JE caxy_tail_start
|
||||||
|
|
||||||
|
caxy_loop:
|
||||||
|
MOVUPS (SI)(AX*4), X1 // xmm = x[i:i+4]
|
||||||
|
MOVUPS 16(SI)(AX*4), X3
|
||||||
|
MULPS X0, X1 // xmm *= a
|
||||||
|
MULPS X2, X3
|
||||||
|
ADDPS (DI)(AX*4), X1 // xmm += y[i:i+4]
|
||||||
|
ADDPS 16(DI)(AX*4), X3
|
||||||
|
MOVUPS X1, (DI)(AX*4) // y[i:i+4] = xmm
|
||||||
|
MOVUPS X3, 16(DI)(AX*4)
|
||||||
|
ADDQ $8, AX // i+=8
|
||||||
|
LOOPNE caxy_loop // while (--CX) > 0
|
||||||
|
CMPQ BX, $0
|
||||||
|
JE caxy_end
|
||||||
|
|
||||||
|
caxy_tail_start:
|
||||||
|
MOVQ BX, CX
|
||||||
|
|
||||||
|
caxy_tail:
|
||||||
|
MOVSS (SI)(AX*4), X1
|
||||||
|
MULSS X0, X1
|
||||||
|
ADDSS (DI)(AX*4), X1
|
||||||
|
MOVSS X1, (DI)(AX*4)
|
||||||
|
INCQ AX
|
||||||
|
LOOPNE caxy_tail
|
||||||
|
|
||||||
|
caxy_end:
|
||||||
|
RET
|
57
asm/f32/axpyunitaryto_amd64.s
Normal file
57
asm/f32/axpyunitaryto_amd64.s
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
// Copyright ©2016 The gonum Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//+build !noasm,!appengine
|
||||||
|
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
|
||||||
|
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
|
||||||
|
MOVQ dst_base+0(FP), DI // Load data buffer pointers
|
||||||
|
MOVQ x_base+32(FP), SI
|
||||||
|
MOVQ y_base+56(FP), DX
|
||||||
|
MOVQ x_len+40(FP), CX // CX = min( len(x), len(y), len(dst) )
|
||||||
|
CMPQ y_len+64(FP), CX
|
||||||
|
CMOVLEQ y_len+64(FP), CX
|
||||||
|
CMPQ dst_len+8(FP), CX
|
||||||
|
CMOVLEQ dst_len+8(FP), CX
|
||||||
|
CMPQ CX, $0 // Empty return
|
||||||
|
JE caxy_end
|
||||||
|
MOVSS alpha+24(FP), X0
|
||||||
|
SHUFPS $0, X0, X0 // Load alpha 4 times
|
||||||
|
MOVUPS X0, X2 // Copy to X2 for pipelining
|
||||||
|
XORQ AX, AX // i = 0
|
||||||
|
MOVQ CX, BX
|
||||||
|
ANDQ $7, BX // BX = len % 8
|
||||||
|
SHRQ $3, CX // CX = int(len / 8)
|
||||||
|
CMPQ CX, $0
|
||||||
|
JE caxy_tail_start
|
||||||
|
|
||||||
|
caxy_loop:
|
||||||
|
MOVUPS (SI)(AX*4), X1 // xmm = x[i:i+4]
|
||||||
|
MOVUPS 16(SI)(AX*4), X3
|
||||||
|
MULPS X0, X1 // xmm *= a
|
||||||
|
MULPS X2, X3
|
||||||
|
ADDPS (DX)(AX*4), X1 // xmm += y[i:i+4]
|
||||||
|
ADDPS 16(DX)(AX*4), X3
|
||||||
|
MOVUPS X1, (DI)(AX*4) // dst[i:i+4] = xmm
|
||||||
|
MOVUPS X3, 16(DI)(AX*4)
|
||||||
|
ADDQ $8, AX // i+=8
|
||||||
|
LOOPNE caxy_loop // while (--CX) > 0
|
||||||
|
CMPQ BX, $0
|
||||||
|
JE caxy_end
|
||||||
|
|
||||||
|
caxy_tail_start:
|
||||||
|
MOVQ BX, CX
|
||||||
|
|
||||||
|
caxy_tail:
|
||||||
|
MOVSS (SI)(AX*4), X1
|
||||||
|
MULSS X0, X1
|
||||||
|
ADDSS (DX)(AX*4), X1
|
||||||
|
MOVSS X1, (DI)(AX*4)
|
||||||
|
INCQ AX
|
||||||
|
LOOPNE caxy_tail
|
||||||
|
|
||||||
|
caxy_end:
|
||||||
|
RET
|
110
asm/f32/stubs_test.go
Normal file
110
asm/f32/stubs_test.go
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
// Copyright ©2015 The gonum Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package f32
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
nan, inf, ninf float32
|
||||||
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
nan, inf, ninf = float32(math.NaN()), float32(math.Inf(1)), float32(math.Inf(-1))
|
||||||
|
}
|
||||||
|
|
||||||
|
func diff(x, y float32) bool {
|
||||||
|
a, b := float64(x), float64(y)
|
||||||
|
return x != y && !math.IsNaN(a) && !math.IsNaN(b) || (math.IsNaN(a) != math.IsNaN(b))
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAxpyUnitary(t *testing.T) {
|
||||||
|
for i, v := range []struct {
|
||||||
|
a float32
|
||||||
|
x, y []float32
|
||||||
|
ex []float32
|
||||||
|
}{
|
||||||
|
{0, []float32{}, []float32{}, []float32{}},
|
||||||
|
{nan, []float32{1, 2, 3}, []float32{1, 2, 3, 4}, []float32{nan, nan, nan}},
|
||||||
|
{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
|
||||||
|
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
|
||||||
|
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
|
||||||
|
{-2, []float32{5, 4, 3}, []float32{1, 3, 5}, []float32{-9, -5, -1}},
|
||||||
|
} {
|
||||||
|
AxpyUnitary(v.a, v.x, v.y)
|
||||||
|
for j := range v.ex {
|
||||||
|
if diff(v.ex[j], v.y[j]) {
|
||||||
|
t.Log("Test", i, "Unexpected value at", j, "Got:", v.y[j], "Expected:", v.ex[j])
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAxpyUnitaryTo(t *testing.T) {
|
||||||
|
for i, v := range []struct {
|
||||||
|
a float32
|
||||||
|
x, y, dst []float32
|
||||||
|
ex []float32
|
||||||
|
}{
|
||||||
|
{0, []float32{}, []float32{}, []float32{}, []float32{}},
|
||||||
|
{nan, []float32{1, 2, 3},
|
||||||
|
[]float32{1, 2, 3, 4},
|
||||||
|
[]float32{0, 0, 0},
|
||||||
|
[]float32{nan, nan, nan}},
|
||||||
|
{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
|
||||||
|
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
|
||||||
|
make([]float32, 8),
|
||||||
|
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
|
||||||
|
{-2, []float32{5, 4, 3},
|
||||||
|
[]float32{1, 3, 5},
|
||||||
|
[]float32{0, 0, 0},
|
||||||
|
[]float32{-9, -5, -1}},
|
||||||
|
} {
|
||||||
|
AxpyUnitaryTo(v.dst, v.a, v.x, v.y)
|
||||||
|
for j := range v.ex {
|
||||||
|
if diff(v.ex[j], v.dst[j]) {
|
||||||
|
t.Log("Test", i, "Unexpected value at", j, "Got:", v.dst[j], "Expected:", v.ex[j])
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
|
||||||
|
func TestAxpyInc(t *testing.T) {
|
||||||
|
for i, v := range []struct {
|
||||||
|
a float32
|
||||||
|
x, y []float32
|
||||||
|
ex []float32
|
||||||
|
n, incX, incY, ix, iy uintptr
|
||||||
|
}{
|
||||||
|
{0, []float32{}, []float32{}, []float32{}, 0, 10, 10, 5, 5},
|
||||||
|
{nan, []float32{1, 2, 3},
|
||||||
|
[]float32{1, 2, 3, 4},
|
||||||
|
[]float32{nan, nan, nan}, 3, 1, 1, 0, 0},
|
||||||
|
{0, []float32{1, 2, 3},
|
||||||
|
[]float32{1, 2, 3, 4},
|
||||||
|
[]float32{nan, nan, 3}, 1, 1, 1, 2, 2},
|
||||||
|
/*{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
|
||||||
|
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
|
||||||
|
make([]float32, 8),
|
||||||
|
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
|
||||||
|
{-2, []float32{5, 4, 3},
|
||||||
|
[]float32{1, 3, 5},
|
||||||
|
[]float32{0, 0, 0},
|
||||||
|
[]float32{-9, -5, -1}},*/
|
||||||
|
} {
|
||||||
|
AxpyInc(v.a, v.x, v.y, v.n, v.incX, v.incY, v.ix, v.iy)
|
||||||
|
for j, k := v.iy, 0; k < int(v.n); j, k = j+v.incY, k+1 {
|
||||||
|
if diff(v.ex[j], v.y[j]) {
|
||||||
|
t.Log("Test", i, "Unexpected value at", j, "Got:", v.y[j], "Expected:", v.ex[j])
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Reference in New Issue
Block a user