Added a 16-byte alignment check on dst to the Add assembly routine.

This commit is contained in:
Chad Kunde
2016-05-23 02:18:33 -07:00
parent d5bb447188
commit d7bd77f23f
2 changed files with 33 additions and 17 deletions

View File

@@ -9,15 +9,28 @@
// func Add(dst, s []float64) // func Add(dst, s []float64)
TEXT ·Add(SB), NOSPLIT, $0 TEXT ·Add(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), DX MOVQ dst_len+8(FP), CX
MOVQ s_base+24(FP), SI MOVQ s_base+24(FP), SI
CMPQ s_len+32(FP), DX CMPQ s_len+32(FP), CX
CMOVQLE s_len+32(FP), DX CMOVQLE s_len+32(FP), CX
CMPQ DX, $0 CMPQ CX, $0
JE add_end JE add_end
XORQ AX, AX XORQ AX, AX
CMPQ DX, $4 MOVQ DI, BX
JL add_tail ANDQ $15, BX
JZ add_no_trim
MOVSD (DI)(AX*8), X0
ADDSD (SI)(AX*8), X0
MOVSD X0, (DI)(AX*8)
INCQ AX
DECQ CX
JE add_end
add_no_trim:
MOVQ CX, BX
ANDQ $3, BX
SHRQ $2, CX
JZ add_tail_start
add_loop: add_loop:
MOVUPS (SI)(AX*8), X0 MOVUPS (SI)(AX*8), X0
@@ -27,19 +40,19 @@ add_loop:
ADDPD 16(DI)(AX*8), X1 ADDPD 16(DI)(AX*8), X1
MOVUPS X1, 16(DI)(AX*8) MOVUPS X1, 16(DI)(AX*8)
ADDQ $4, AX ADDQ $4, AX
SUBQ $4, DX LOOPNE add_loop
CMPQ DX, $4 CMPQ BX, $0
JGE add_loop
CMPQ DX, $0
JE add_end JE add_end
add_tail_start:
MOVQ BX, CX
add_tail: add_tail:
MOVSD (DI)(AX*8), X0 MOVSD (DI)(AX*8), X0
ADDSD (SI)(AX*8), X0 ADDSD (SI)(AX*8), X0
MOVSD X0, (DI)(AX*8) MOVSD X0, (DI)(AX*8)
INCQ AX INCQ AX
DECQ DX LOOPNE add_tail
JNZ add_tail
add_end: add_end:
RET RET

View File

@@ -32,6 +32,9 @@ func TestAdd(t *testing.T) {
{[]float64{0, 1, 2, 3, 4}, {[]float64{0, 1, 2, 3, 4},
[]float64{-inf, 4, nan, 8, 9}, []float64{-inf, 4, nan, 8, 9},
[]float64{-inf, 5, nan, 11, 13}}, []float64{-inf, 5, nan, 11, 13}},
{make([]float64, 50)[1:49],
make([]float64, 50)[1:49],
make([]float64, 50)[1:49]},
} { } {
Add(v.dst, v.src) Add(v.dst, v.src)
for i := range v.expect { for i := range v.expect {
@@ -70,7 +73,7 @@ func TestCumSum(t *testing.T) {
}{ }{
{[]float64{0}, []float64{1}, []float64{1}}, {[]float64{0}, []float64{1}, []float64{1}},
{[]float64{nan}, []float64{nan}, []float64{nan}}, {[]float64{nan}, []float64{nan}, []float64{nan}},
{[]float64{0, 0, 0}, []float64{1, 2, 3, 4}, []float64{1, 3, 6}}, {[]float64{0, 0, 0}, []float64{1, 2, 3}, []float64{1, 3, 6}},
{[]float64{0, 0, 0, 0}, []float64{1, 2, 3}, []float64{1, 3, 6}}, {[]float64{0, 0, 0, 0}, []float64{1, 2, 3}, []float64{1, 3, 6}},
{[]float64{0, 0, 0, 0}, []float64{1, 2, 3, 4}, []float64{1, 3, 6, 10}}, {[]float64{0, 0, 0, 0}, []float64{1, 2, 3, 4}, []float64{1, 3, 6, 10}},
{[]float64{1, nan, nan, 1, 1}, {[]float64{1, nan, nan, 1, 1},