mirror of
https://github.com/gonum/gonum.git
synced 2025-10-22 06:39:26 +08:00
f32 axpy routines with tests and cache alignment.
This commit is contained in:
56
asm/f32/axpyinc_amd64.s
Normal file
56
asm/f32/axpyinc_amd64.s
Normal file
@@ -0,0 +1,56 @@
|
||||
// Copyright ©2016 The gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
//
// AxpyInc performs y[iy + i*incY] += alpha * x[ix + i*incX] for i in [0, n).
// Elements are processed two per loop iteration; when n is odd the last
// element is handled separately after the loop.
TEXT ·AxpyInc(SB), NOSPLIT, $0
	MOVQ  n+56(FP), CX      // CX = n
	CMPQ  CX, $0            // nothing to do when n is zero (signed compare)
	JLE   saxyi_end
	MOVQ  x_base+8(FP), SI  // SI = &x[0]
	MOVQ  y_base+32(FP), DI // DI = &y[0]
	MOVQ  ix+80(FP), AX
	MOVQ  iy+88(FP), BX
	LEAQ  (SI)(AX*4), SI    // SI = &x[ix]
	LEAQ  (DI)(BX*4), DI    // DI = &y[iy]
	MOVQ  incX+64(FP), AX
	MOVQ  incY+72(FP), BX
	IMULQ $4, AX            // AX = incX * sizeof(float32)
	IMULQ $4, BX            // BX = incY * sizeof(float32)
	MOVSS alpha+0(FP), X0   // X0 = alpha
	MOVSS X0, X2            // X2 = alpha (second copy for the paired multiply)

	// Keep the parity of n in its own register. Deriving the flag from the
	// stride (as was done previously) loses the odd element when incX == 0.
	MOVQ  CX, R9
	ANDQ  $1, R9            // R9 = n % 2
	SHRQ  $1, CX            // CX = n / 2; ZF is set when n == 1
	JZ    saxyi_odd         // no full pair to process

saxyi_loop:
	MOVSS (SI), X1          // X1 = x[i]
	MOVSS (SI)(AX*1), X3    // X3 = x[i+incX]
	MULSS X0, X1            // X1 *= alpha
	MULSS X2, X3            // X3 *= alpha
	ADDSS (DI), X1          // X1 += y[j]
	ADDSS (DI)(BX*1), X3    // X3 += y[j+incY]
	MOVSS X1, (DI)          // y[j] = X1
	MOVSS X3, (DI)(BX*1)    // y[j+incY] = X3; must use the y stride (BX), not AX
	LEAQ  (SI)(AX*2), SI    // advance x by two strides
	LEAQ  (DI)(BX*2), DI    // advance y by two strides
	DECQ  CX                // explicit counter; does not rely on ZF like LOOPNE
	JNZ   saxyi_loop
	CMPQ  R9, $0            // even n: done
	JE    saxyi_end

saxyi_odd:
	// Trailing element for odd n.
	MOVSS (SI), X1
	MULSS X0, X1
	ADDSS (DI), X1
	MOVSS X1, (DI)

saxyi_end:
	RET
|
56
asm/f32/axpyunitary_amd64.s
Normal file
56
asm/f32/axpyunitary_amd64.s
Normal file
@@ -0,0 +1,56 @@
|
||||
// Copyright ©2016 The gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// func AxpyUnitary(alpha float32, x, y []float32)
//
// AxpyUnitary performs y[i] += alpha * x[i] for i in [0, min(len(x), len(y))).
// The bulk of the work is done eight elements per iteration with packed
// SSE instructions; the remaining len%8 elements are handled one at a time.
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
	MOVQ    x_base+8(FP), SI   // SI = &x[0]
	MOVQ    y_base+32(FP), DI  // DI = &y[0]
	MOVQ    x_len+16(FP), CX   // CX = min( len(x), len(y) )
	CMPQ    y_len+40(FP), CX
	CMOVQLE y_len+40(FP), CX   // 64-bit move when len(y) <= CX (CMOVLEQ is the 32-bit move-if-equal)
	CMPQ    CX, $0
	JE      caxy_end           // empty input
	MOVSS   alpha+0(FP), X0
	SHUFPS  $0, X0, X0         // broadcast alpha to all four lanes of X0
	MOVUPS  X0, X2             // second copy of alpha for the paired multiply
	XORQ    AX, AX             // i = 0

	// X1 and X3 are overwritten before use, so these two instructions do not
	// affect the result; they were placed here as padding to align the loop.
	// NOTE(review): the padding was tuned for the previous instruction
	// encoding; re-check loop alignment if that matters for performance.
	PXOR    X1, X1
	PXOR    X3, X3
	MOVQ    CX, BX
	ANDQ    $7, BX             // BX = len % 8
	SHRQ    $3, CX             // CX = len / 8
	CMPQ    CX, $0
	JE      caxy_tail_start

caxy_loop:
	MOVUPS (SI)(AX*4), X1      // X1 = x[i:i+4]
	MOVUPS 16(SI)(AX*4), X3    // X3 = x[i+4:i+8]
	MULPS  X0, X1              // X1 *= alpha
	MULPS  X2, X3              // X3 *= alpha

	// Load y with MOVUPS before adding: ADDPS with a memory operand requires
	// a 16-byte aligned address, which a []float32 does not guarantee.
	MOVUPS (DI)(AX*4), X4      // X4 = y[i:i+4]
	MOVUPS 16(DI)(AX*4), X5    // X5 = y[i+4:i+8]
	ADDPS  X4, X1              // X1 += y[i:i+4]
	ADDPS  X5, X3              // X3 += y[i+4:i+8]
	MOVUPS X1, (DI)(AX*4)      // y[i:i+4] = X1
	MOVUPS X3, 16(DI)(AX*4)    // y[i+4:i+8] = X3
	ADDQ   $8, AX              // i += 8
	DECQ   CX
	JNZ    caxy_loop           // while --CX != 0
	CMPQ   BX, $0
	JE     caxy_end            // no remainder

caxy_tail_start:
	MOVQ BX, CX                // CX = number of remaining elements

caxy_tail:
	MOVSS (SI)(AX*4), X1       // X1 = x[i]
	MULSS X0, X1               // X1 *= alpha
	ADDSS (DI)(AX*4), X1       // X1 += y[i]
	MOVSS X1, (DI)(AX*4)       // y[i] = X1
	INCQ  AX                   // i++
	DECQ  CX
	JNZ   caxy_tail            // while --CX != 0

caxy_end:
	RET
|
57
asm/f32/axpyunitaryto_amd64.s
Normal file
57
asm/f32/axpyunitaryto_amd64.s
Normal file
@@ -0,0 +1,57 @@
|
||||
// Copyright ©2016 The gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
//
// AxpyUnitaryTo performs dst[i] = alpha * x[i] + y[i] for i in
// [0, min(len(x), len(y), len(dst))). The bulk of the work is done eight
// elements per iteration with packed SSE instructions; the remaining len%8
// elements are handled one at a time.
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
	MOVQ    dst_base+0(FP), DI // DI = &dst[0]
	MOVQ    x_base+32(FP), SI  // SI = &x[0]
	MOVQ    y_base+56(FP), DX  // DX = &y[0]
	MOVQ    x_len+40(FP), CX   // CX = min( len(x), len(y), len(dst) )
	CMPQ    y_len+64(FP), CX
	CMOVQLE y_len+64(FP), CX   // 64-bit move when len(y) <= CX (CMOVLEQ is the 32-bit move-if-equal)
	CMPQ    dst_len+8(FP), CX
	CMOVQLE dst_len+8(FP), CX  // 64-bit move when len(dst) <= CX
	CMPQ    CX, $0
	JE      caxy_end           // empty input
	MOVSS   alpha+24(FP), X0
	SHUFPS  $0, X0, X0         // broadcast alpha to all four lanes of X0
	MOVUPS  X0, X2             // second copy of alpha for the paired multiply
	XORQ    AX, AX             // i = 0
	MOVQ    CX, BX
	ANDQ    $7, BX             // BX = len % 8
	SHRQ    $3, CX             // CX = len / 8
	CMPQ    CX, $0
	JE      caxy_tail_start

caxy_loop:
	MOVUPS (SI)(AX*4), X1      // X1 = x[i:i+4]
	MOVUPS 16(SI)(AX*4), X3    // X3 = x[i+4:i+8]
	MULPS  X0, X1              // X1 *= alpha
	MULPS  X2, X3              // X3 *= alpha

	// Load y with MOVUPS before adding: ADDPS with a memory operand requires
	// a 16-byte aligned address, which a []float32 does not guarantee.
	MOVUPS (DX)(AX*4), X4      // X4 = y[i:i+4]
	MOVUPS 16(DX)(AX*4), X5    // X5 = y[i+4:i+8]
	ADDPS  X4, X1              // X1 += y[i:i+4]
	ADDPS  X5, X3              // X3 += y[i+4:i+8]
	MOVUPS X1, (DI)(AX*4)      // dst[i:i+4] = X1
	MOVUPS X3, 16(DI)(AX*4)    // dst[i+4:i+8] = X3
	ADDQ   $8, AX              // i += 8
	DECQ   CX
	JNZ    caxy_loop           // while --CX != 0
	CMPQ   BX, $0
	JE     caxy_end            // no remainder

caxy_tail_start:
	MOVQ BX, CX                // CX = number of remaining elements

caxy_tail:
	MOVSS (SI)(AX*4), X1       // X1 = x[i]
	MULSS X0, X1               // X1 *= alpha
	ADDSS (DX)(AX*4), X1       // X1 += y[i]
	MOVSS X1, (DI)(AX*4)       // dst[i] = X1
	INCQ  AX                   // i++
	DECQ  CX
	JNZ   caxy_tail            // while --CX != 0

caxy_end:
	RET
|
110
asm/f32/stubs_test.go
Normal file
110
asm/f32/stubs_test.go
Normal file
@@ -0,0 +1,110 @@
|
||||
// Copyright ©2015 The gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package f32
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Special float32 values used by the test tables in this file.
var (
	nan  = float32(math.NaN())
	inf  = float32(math.Inf(1))
	ninf = float32(math.Inf(-1))
)
|
||||
|
||||
// diff reports whether x and y should be treated as different results.
// Two NaN values are considered the same, a NaN never matches a non-NaN
// value, and any other pair is compared with !=.
func diff(x, y float32) bool {
	xIsNaN := math.IsNaN(float64(x))
	yIsNaN := math.IsNaN(float64(y))
	if xIsNaN || yIsNaN {
		return xIsNaN != yIsNaN
	}
	return x != y
}
|
||||
|
||||
func TestAxpyUnitary(t *testing.T) {
|
||||
for i, v := range []struct {
|
||||
a float32
|
||||
x, y []float32
|
||||
ex []float32
|
||||
}{
|
||||
{0, []float32{}, []float32{}, []float32{}},
|
||||
{nan, []float32{1, 2, 3}, []float32{1, 2, 3, 4}, []float32{nan, nan, nan}},
|
||||
{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
|
||||
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
|
||||
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
|
||||
{-2, []float32{5, 4, 3}, []float32{1, 3, 5}, []float32{-9, -5, -1}},
|
||||
} {
|
||||
AxpyUnitary(v.a, v.x, v.y)
|
||||
for j := range v.ex {
|
||||
if diff(v.ex[j], v.y[j]) {
|
||||
t.Log("Test", i, "Unexpected value at", j, "Got:", v.y[j], "Expected:", v.ex[j])
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAxpyUnitaryTo(t *testing.T) {
|
||||
for i, v := range []struct {
|
||||
a float32
|
||||
x, y, dst []float32
|
||||
ex []float32
|
||||
}{
|
||||
{0, []float32{}, []float32{}, []float32{}, []float32{}},
|
||||
{nan, []float32{1, 2, 3},
|
||||
[]float32{1, 2, 3, 4},
|
||||
[]float32{0, 0, 0},
|
||||
[]float32{nan, nan, nan}},
|
||||
{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
|
||||
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
|
||||
make([]float32, 8),
|
||||
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
|
||||
{-2, []float32{5, 4, 3},
|
||||
[]float32{1, 3, 5},
|
||||
[]float32{0, 0, 0},
|
||||
[]float32{-9, -5, -1}},
|
||||
} {
|
||||
AxpyUnitaryTo(v.dst, v.a, v.x, v.y)
|
||||
for j := range v.ex {
|
||||
if diff(v.ex[j], v.dst[j]) {
|
||||
t.Log("Test", i, "Unexpected value at", j, "Got:", v.dst[j], "Expected:", v.ex[j])
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
|
||||
func TestAxpyInc(t *testing.T) {
|
||||
for i, v := range []struct {
|
||||
a float32
|
||||
x, y []float32
|
||||
ex []float32
|
||||
n, incX, incY, ix, iy uintptr
|
||||
}{
|
||||
{0, []float32{}, []float32{}, []float32{}, 0, 10, 10, 5, 5},
|
||||
{nan, []float32{1, 2, 3},
|
||||
[]float32{1, 2, 3, 4},
|
||||
[]float32{nan, nan, nan}, 3, 1, 1, 0, 0},
|
||||
{0, []float32{1, 2, 3},
|
||||
[]float32{1, 2, 3, 4},
|
||||
[]float32{nan, nan, 3}, 1, 1, 1, 2, 2},
|
||||
/*{5, []float32{0, 1, 2, 3, 4, 5, 6, 7},
|
||||
[]float32{2, 3, 4, 5, 6, 7, 8, 9},
|
||||
make([]float32, 8),
|
||||
[]float32{2, 8, 14, 20, 26, 32, 38, 44}},
|
||||
{-2, []float32{5, 4, 3},
|
||||
[]float32{1, 3, 5},
|
||||
[]float32{0, 0, 0},
|
||||
[]float32{-9, -5, -1}},*/
|
||||
} {
|
||||
AxpyInc(v.a, v.x, v.y, v.n, v.incX, v.incY, v.ix, v.iy)
|
||||
for j, k := v.iy, 0; k < int(v.n); j, k = j+v.incY, k+1 {
|
||||
if diff(v.ex[j], v.y[j]) {
|
||||
t.Log("Test", i, "Unexpected value at", j, "Got:", v.y[j], "Expected:", v.ex[j])
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user