// Reviewer notes for this patch. These lines sit ahead of the first
// "diff --git" header and are not part of any hunk.
//
// asm/f64/add_amd64.s, func Add(dst, s []float64):
//   * The element count (the smaller of dst_len and s_len, selected with
//     CMPQ/CMOVQLE) is now kept in CX instead of DX.
//   * If any of the low four bits of the dst base address in DI are set
//     (ANDQ $15), one element is processed with MOVSD/ADDSD/MOVSD before the
//     packed loop; if that was the only element the routine jumps to add_end.
//   * At add_no_trim the remaining count is split: BX = count & 3 is the
//     scalar tail length, CX = count >> 2 is the iteration count for
//     add_loop, which advances AX by 4 per pass. When CX becomes zero the
//     loop is skipped via add_tail_start, which copies BX into CX.
//   * The explicit SUBQ/DECQ + compare + branch sequences are replaced by
//     LOOPNE on CX in both add_loop and add_tail.
//     NOTE(review): LOOPNE tests ZF in addition to CX; here the instruction
//     immediately before each LOOPNE is an ADDQ/INCQ on AX. Confirm that
//     relying on that flag state is intended, or whether plain LOOP was meant.
//   * Only the dst pointer is tested for alignment; the visible loop lines
//     still use MOVUPS. The middle of add_loop is not shown in this patch.
//
// asm/f64/stubs_test.go:
//   * TestAdd gains a case whose dst, src and expect are each
//     make([]float64, 50)[1:49]: 48 zero-valued elements starting one element
//     into a fresh allocation.
//     NOTE(review): presumably this targets the new unaligned-dst path; since
//     all inputs and expectations are zero, confirm whether non-zero data is
//     needed to make the case meaningful.
//   * One TestCumSum case has its second slice literal shortened from
//     {1, 2, 3, 4} to {1, 2, 3}. The rest of that table continues past the
//     end of this excerpt.
diff --git a/asm/f64/add_amd64.s b/asm/f64/add_amd64.s index a23c7436..6568765d 100644 --- a/asm/f64/add_amd64.s +++ b/asm/f64/add_amd64.s @@ -9,15 +9,28 @@ // func Add(dst, s []float64) TEXT ·Add(SB), NOSPLIT, $0 MOVQ dst_base+0(FP), DI - MOVQ dst_len+8(FP), DX + MOVQ dst_len+8(FP), CX MOVQ s_base+24(FP), SI - CMPQ s_len+32(FP), DX - CMOVQLE s_len+32(FP), DX - CMPQ DX, $0 + CMPQ s_len+32(FP), CX + CMOVQLE s_len+32(FP), CX + CMPQ CX, $0 JE add_end XORQ AX, AX - CMPQ DX, $4 - JL add_tail + MOVQ DI, BX + ANDQ $15, BX + JZ add_no_trim + MOVSD (DI)(AX*8), X0 + ADDSD (SI)(AX*8), X0 + MOVSD X0, (DI)(AX*8) + INCQ AX + DECQ CX + JE add_end + +add_no_trim: + MOVQ CX, BX + ANDQ $3, BX + SHRQ $2, CX + JZ add_tail_start add_loop: MOVUPS (SI)(AX*8), X0 @@ -27,19 +40,19 @@ add_loop: ADDPD 16(DI)(AX*8), X1 MOVUPS X1, 16(DI)(AX*8) ADDQ $4, AX - SUBQ $4, DX - CMPQ DX, $4 - JGE add_loop - CMPQ DX, $0 + LOOPNE add_loop + CMPQ BX, $0 JE add_end +add_tail_start: + MOVQ BX, CX + add_tail: - MOVSD (DI)(AX*8), X0 - ADDSD (SI)(AX*8), X0 - MOVSD X0, (DI)(AX*8) - INCQ AX - DECQ DX - JNZ add_tail + MOVSD (DI)(AX*8), X0 + ADDSD (SI)(AX*8), X0 + MOVSD X0, (DI)(AX*8) + INCQ AX + LOOPNE add_tail add_end: RET diff --git a/asm/f64/stubs_test.go b/asm/f64/stubs_test.go index 3c6c8602..17890412 100644 --- a/asm/f64/stubs_test.go +++ b/asm/f64/stubs_test.go @@ -32,6 +32,9 @@ func TestAdd(t *testing.T) { {[]float64{0, 1, 2, 3, 4}, []float64{-inf, 4, nan, 8, 9}, []float64{-inf, 5, nan, 11, 13}}, + {make([]float64, 50)[1:49], + make([]float64, 50)[1:49], + make([]float64, 50)[1:49]}, } { Add(v.dst, v.src) for i := range v.expect { @@ -70,7 +73,7 @@ func TestCumSum(t *testing.T) { }{ {[]float64{0}, []float64{1}, []float64{1}}, {[]float64{nan}, []float64{nan}, []float64{nan}}, - {[]float64{0, 0, 0}, []float64{1, 2, 3, 4}, []float64{1, 3, 6}}, + {[]float64{0, 0, 0}, []float64{1, 2, 3}, []float64{1, 3, 6}}, {[]float64{0, 0, 0, 0}, []float64{1, 2, 3}, []float64{1, 3, 6}}, {[]float64{0, 0, 0, 0}, []float64{1, 
2, 3, 4}, []float64{1, 3, 6, 10}}, {[]float64{1, nan, nan, 1, 1},