diff --git a/asm/f64/abssum_amd64.s b/asm/f64/abssum_amd64.s
index 916f052d..8233ea58 100644
--- a/asm/f64/abssum_amd64.s
+++ b/asm/f64/abssum_amd64.s
@@ -8,10 +8,10 @@
 // func AbsSum(x []float64) float64
 TEXT ·AbsSum(SB), NOSPLIT, $0
-	MOVQ x_base+0(FP), SI
-	MOVQ x_len+8(FP), CX
-	XORQ AX, AX
-	PXOR X0, X0
+	MOVQ x_base+0(FP), SI // SI := &x
+	MOVQ x_len+8(FP), CX  // CX := len(x)
+	XORQ AX, AX           // i := 0
+	PXOR X0, X0           // p_sum_i := 0
 	PXOR X1, X1
 	PXOR X2, X2
 	PXOR X3, X3
@@ -19,59 +19,64 @@ TEXT ·AbsSum(SB), NOSPLIT, $0
 	PXOR X5, X5
 	PXOR X6, X6
 	PXOR X7, X7
-	CMPQ CX, $0
+	CMPQ CX, $0 // if CX == 0 { return 0 }
 	JE   absum_end
 	MOVQ CX, BX
-	ANDQ $7, BX
-	SHRQ $3, CX
-	JZ   absum_tail_start
+	ANDQ $7, BX           // BX := CX % 8
+	SHRQ $3, CX           // CX = floor( CX / 8 )
+	JZ   absum_tail_start // if CX == 0 { goto absum_tail_start }

-absum_loop:
-	MOVUPS (SI)(AX*8), X8
+absum_loop: // do {
+	// p_sum = max( p_sum + x[i], p_sum - x[i] )
+	MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+1]
 	MOVUPS 16(SI)(AX*8), X9
 	MOVUPS 32(SI)(AX*8), X10
 	MOVUPS 48(SI)(AX*8), X11
-	ADDPD X8, X0
+	ADDPD X8, X0 // p_sum_i += X_i ( positive values )
 	ADDPD X9, X2
 	ADDPD X10, X4
 	ADDPD X11, X6
-	SUBPD X8, X1
+	SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
 	SUBPD X9, X3
 	SUBPD X10, X5
 	SUBPD X11, X7
-	MAXPD X1, X0
+	MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
 	MAXPD X3, X2
 	MAXPD X5, X4
 	MAXPD X7, X6
-	MOVAPS X0, X1
+	MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
 	MOVAPS X2, X3
 	MOVAPS X4, X5
 	MOVAPS X6, X7
-	ADDQ $8, AX
-	LOOP absum_loop
-	ADDPD X3, X0
-	ADDPD X5, X7
-	ADDPD X7, X0
+	ADDQ $8, AX     // i += 8
+	LOOP absum_loop // } while --CX > 0
+
+	// p_sum_0 += p_sum_3 + p_sum_5 + p_sum_7 ( fold the partial sums )
+	ADDPD X3, X0
+	ADDPD X5, X7
+	ADDPD X7, X0
+
+	// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
 	MOVAPS X0, X1
-	SHUFPD $0x3, X0, X0
+	SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
 	ADDSD X1, X0
-	MOVSD X0, X1
 	CMPQ BX, $0
-	JE   absum_end
+	JE   absum_end // if BX == 0 { goto absum_end }

-absum_tail_start:
-	MOVQ BX, CX
-	XORPS X8, X8
+absum_tail_start: // Reset loop registers
+	MOVQ BX, CX  // Loop counter: CX = BX
+	XORPS X8, X8 // X_8 = 0

-absum_tail:
-	MOVSD (SI)(AX*8), X8
-	ADDSD X8, X0
-	SUBSD X8, X1
-	MAXSD X1, X0
-	MOVSD X0, X1
-	INCQ AX
-	LOOP absum_tail
+absum_tail: // do {
+	// p_sum = max( p_sum + x[i], p_sum - x[i] )
+	MOVSD (SI)(AX*8), X8 // X_8 = x[i]
+	MOVSD X0, X1         // p_sum_1 = p_sum_0
+	ADDSD X8, X0         // p_sum_0 += X_8
+	SUBSD X8, X1         // p_sum_1 -= X_8
+	MAXSD X1, X0         // p_sum_0 = max( p_sum_0, p_sum_1 )
+	INCQ AX              // i++
+	LOOP absum_tail      // } while --CX > 0

-absum_end:
-	MOVSD X1, sum+24(FP)
+absum_end: // return p_sum_0
+	MOVSD X0, sum+24(FP)
 	RET
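Note for review: AbsSum never takes an explicit absolute value. Each partial sum stays non-negative, so replacing it with max( p_sum + x[i], p_sum - x[i] ) adds |x[i]| on every step. A scalar Go model of what the kernel computes (a sketch for illustration only; absSumRef is a hypothetical name, not part of this change):

func absSumRef(x []float64) float64 {
	sum := 0.0 // running sum, always >= 0
	for _, v := range x {
		plus, minus := sum+v, sum-v
		// max(sum+v, sum-v) == sum + |v| because sum >= 0.
		if minus > plus {
			plus = minus
		}
		sum = plus
	}
	return sum
}

The eight X0-X7 registers in the unrolled loop are four independent two-lane copies of this recurrence, folded together after the loop.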
diff --git a/asm/f64/abssuminc_amd64.s b/asm/f64/abssuminc_amd64.s
index 7f18970f..3a9fb76d 100644
--- a/asm/f64/abssuminc_amd64.s
+++ b/asm/f64/abssuminc_amd64.s
@@ -8,13 +8,13 @@
 // func AbsSumInc(x []float64, n, incX int) (sum float64)
 TEXT ·AbsSumInc(SB), NOSPLIT, $0
-	MOVQ x_base+0(FP), SI
-	MOVQ n+24(FP), CX
-	MOVQ incX+32(FP), AX
+	MOVQ x_base+0(FP), SI // SI := &x
+	MOVQ n+24(FP), CX     // CX := n
+	MOVQ incX+32(FP), AX  // AX := incX * sizeof( float64 )
 	SHLQ $3, AX
-	MOVQ AX, DX
+	MOVQ AX, DX // DX := AX * 3
 	IMULQ $3, DX
-	PXOR X0, X0
+	PXOR X0, X0 // p_sum_i := 0
 	PXOR X1, X1
 	PXOR X2, X2
 	PXOR X3, X3
@@ -22,64 +22,69 @@ TEXT ·AbsSumInc(SB), NOSPLIT, $0
 	PXOR X5, X5
 	PXOR X6, X6
 	PXOR X7, X7
-	CMPQ CX, $0
+	CMPQ CX, $0 // if CX == 0 { return 0 }
 	JE   absum_end
 	MOVQ CX, BX
-	ANDQ $7, BX
-	SHRQ $3, CX
-	JZ   absum_tail_start
+	ANDQ $7, BX           // BX := CX % 8
+	SHRQ $3, CX           // CX = floor( CX / 8 )
+	JZ   absum_tail_start // if CX == 0 { goto absum_tail_start }

-absum_loop:
-	MOVSD (SI), X8
+absum_loop: // do {
+	// p_sum = max( p_sum + x[i], p_sum - x[i] )
+	MOVSD (SI), X8 // X_i[0] = x[i]
 	MOVSD (SI)(AX*1), X9
 	MOVSD (SI)(AX*2), X10
 	MOVSD (SI)(DX*1), X11
-	LEAQ (SI)(AX*4), SI
-	MOVHPD (SI), X8
+	LEAQ (SI)(AX*4), SI // SI = SI + 4 * incX * sizeof( float64 )
+	MOVHPD (SI), X8     // X_i[1] = x[i+4]
 	MOVHPD (SI)(AX*1), X9
 	MOVHPD (SI)(AX*2), X10
 	MOVHPD (SI)(DX*1), X11
-	ADDPD X8, X0
+	ADDPD X8, X0 // p_sum_i += X_i ( positive values )
 	ADDPD X9, X2
 	ADDPD X10, X4
 	ADDPD X11, X6
-	SUBPD X8, X1
+	SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
 	SUBPD X9, X3
 	SUBPD X10, X5
 	SUBPD X11, X7
-	MAXPD X1, X0
+	MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
 	MAXPD X3, X2
 	MAXPD X5, X4
 	MAXPD X7, X6
-	MOVAPS X0, X1
+	MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
 	MOVAPS X2, X3
 	MOVAPS X4, X5
 	MOVAPS X6, X7
-	LEAQ (SI)(AX*4), SI
-	LOOP absum_loop
-	ADDPD X3, X0
-	ADDPD X5, X7
-	ADDPD X7, X0
+	LEAQ (SI)(AX*4), SI // SI = SI + 4 * incX * sizeof( float64 )
+	LOOP absum_loop     // } while --CX > 0
+
+	// p_sum_0 += p_sum_3 + p_sum_5 + p_sum_7 ( fold the partial sums )
+	ADDPD X3, X0
+	ADDPD X5, X7
+	ADDPD X7, X0
+
+	// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
 	MOVAPS X0, X1
-	SHUFPD $0x3, X0, X0
+	SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
 	ADDSD X1, X0
-	MOVSD X0, X1
 	CMPQ BX, $0
-	JE   absum_end
+	JE   absum_end // if BX == 0 { goto absum_end }

-absum_tail_start:
-	MOVQ BX, CX
-	XORPS X8, X8
+absum_tail_start: // Reset loop registers
+	MOVQ BX, CX  // Loop counter: CX = BX
+	XORPS X8, X8 // X_8 = 0

-absum_tail:
-	MOVSD (SI), X8
-	ADDSD X8, X0
-	SUBSD X8, X1
-	MAXSD X1, X0
-	MOVSD X0, X1
-	ADDQ AX, SI
-	LOOP absum_tail
+absum_tail: // do {
+	// p_sum = max( p_sum + x[i], p_sum - x[i] )
+	MOVSD (SI), X8  // X_8 = x[i]
+	MOVSD X0, X1    // p_sum_1 = p_sum_0
+	ADDSD X8, X0    // p_sum_0 += X_8
+	SUBSD X8, X1    // p_sum_1 -= X_8
+	MAXSD X1, X0    // p_sum_0 = max( p_sum_0, p_sum_1 )
+	ADDQ AX, SI     // SI = SI + incX * sizeof( float64 )
+	LOOP absum_tail // } while --CX > 0

-absum_end:
-	MOVSD X1, sum+40(FP)
+absum_end: // return p_sum_0
+	MOVSD X0, sum+40(FP)
 	RET
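AbsSumInc applies the same recurrence to a strided vector: AX holds incX*8 (the stride in bytes) and DX holds 3*incX*8, so the MOVSD/MOVHPD pairs can address elements i, i+incX, i+2*incX and i+3*incX with plain addressing modes. A hypothetical scalar equivalent, for reference only:

func absSumIncRef(x []float64, n, incX int) (sum float64) {
	for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
		// Same max(sum+v, sum-v) trick as AbsSum, on every incX-th element.
		if d := sum - x[ix]; d > sum+x[ix] {
			sum = d
		} else {
			sum += x[ix]
		}
	}
	return sum
}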
diff --git a/asm/f64/add_amd64.s b/asm/f64/add_amd64.s
index bd49448e..f30247ba 100644
--- a/asm/f64/add_amd64.s
+++ b/asm/f64/add_amd64.s
@@ -8,59 +8,59 @@
 // func Add(dst, s []float64)
 TEXT ·Add(SB), NOSPLIT, $0
-	MOVQ dst_base+0(FP), DI
-	MOVQ dst_len+8(FP), CX
-	MOVQ s_base+24(FP), SI
-	CMPQ s_len+32(FP), CX
+	MOVQ dst_base+0(FP), DI // DI := &dst
+	MOVQ dst_len+8(FP), CX  // CX := len(dst)
+	MOVQ s_base+24(FP), SI  // SI := &s
+	CMPQ s_len+32(FP), CX   // CX := min( CX, len(s) )
 	CMOVQLE s_len+32(FP), CX
-	CMPQ CX, $0
+	CMPQ CX, $0 // if CX == 0 { return }
 	JE   add_end
 	XORQ AX, AX
 	MOVQ DI, BX
-	ANDQ $0x0F, BX
-	JZ   add_no_trim
+	ANDQ $0x0F, BX   // BX := &dst & 15
+	JZ   add_no_trim // if BX == 0 { goto add_no_trim }

 	// Align on 16-bit boundary
-	MOVSD (DI)(AX*8), X0
-	ADDSD (SI)(AX*8), X0
-	MOVSD X0, (DI)(AX*8)
-	INCQ AX
-	DECQ CX
-	JE   add_end
+	MOVSD (SI)(AX*8), X0 // X0 = s[i]
+	ADDSD (DI)(AX*8), X0 // X0 += dst[i]
+	MOVSD X0, (DI)(AX*8) // dst[i] = X0
+	INCQ AX              // i++
+	DECQ CX              // --CX
+	JE   add_end         // if CX == 0 { return }

 add_no_trim:
 	MOVQ CX, BX
-	ANDQ $7, BX
-	SHRQ $3, CX
-	JZ   add_tail_start
+	ANDQ $7, BX         // BX := CX % 8
+	SHRQ $3, CX         // CX = floor( CX / 8 )
+	JZ   add_tail_start // if CX == 0 { goto add_tail_start }

-add_loop: // Loop unrolled 8x
-	MOVUPS (SI)(AX*8), X0
+add_loop: // Loop unrolled 8x do {
+	MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1]
 	MOVUPS 16(SI)(AX*8), X1
 	MOVUPS 32(SI)(AX*8), X2
 	MOVUPS 48(SI)(AX*8), X3
-	ADDPD (DI)(AX*8), X0
+	ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+1]
 	ADDPD 16(DI)(AX*8), X1
 	ADDPD 32(DI)(AX*8), X2
 	ADDPD 48(DI)(AX*8), X3
-	MOVUPS X0, (DI)(AX*8)
+	MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X_i
 	MOVUPS X1, 16(DI)(AX*8)
 	MOVUPS X2, 32(DI)(AX*8)
 	MOVUPS X3, 48(DI)(AX*8)
-	ADDQ $8, AX
-	LOOP add_loop
-	CMPQ BX, $0
+	ADDQ $8, AX   // i += 8
+	LOOP add_loop // } while --CX > 0
+	CMPQ BX, $0   // if BX == 0 { return }
 	JE   add_end

-add_tail_start:
-	MOVQ BX, CX
+add_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX

-add_tail:
-	MOVSD (DI)(AX*8), X0
-	ADDSD (SI)(AX*8), X0
-	MOVSD X0, (DI)(AX*8)
-	INCQ AX
-	LOOP add_tail
+add_tail: // do {
+	MOVSD (SI)(AX*8), X0 // X0 = s[i]
+	ADDSD (DI)(AX*8), X0 // X0 += dst[i]
+	MOVSD X0, (DI)(AX*8) // dst[i] = X0
+	INCQ AX              // ++i
+	LOOP add_tail        // } while --CX > 0

 add_end:
 	RET
diff --git a/asm/f64/addconst_amd64.s b/asm/f64/addconst_amd64.s
index 33754455..5f0e9e31 100644
--- a/asm/f64/addconst_amd64.s
+++ b/asm/f64/addconst_amd64.s
@@ -8,46 +8,46 @@
 // func Addconst(alpha float64, x []float64)
 TEXT ·AddConst(SB), NOSPLIT, $0
-	MOVQ x_base+8(FP), SI
-	MOVQ x_len+16(FP), CX
-	CMPQ CX, $0
+	MOVQ x_base+8(FP), SI // SI := &x
+	MOVQ x_len+16(FP), CX // CX := len(x)
+	CMPQ CX, $0           // if len(x) == 0 { return }
 	JE   ac_end
-	MOVSD alpha+0(FP), X4
+	MOVSD alpha+0(FP), X4 // X4 = { a, a }
 	SHUFPD $0, X4, X4
-	MOVUPS X4, X5
-	XORQ AX, AX
+	MOVUPS X4, X5 // X5 = X4
+	XORQ AX, AX   // i = 0
 	MOVQ CX, BX
-	ANDQ $7, BX
-	SHRQ $3, CX
-	JZ   ac_tail_start
+	ANDQ $7, BX        // BX := len(x) % 8
+	SHRQ $3, CX        // CX := floor( CX / 8 )
+	JZ   ac_tail_start // if CX == 0 { goto ac_tail_start }

-ac_loop:
-	MOVUPS (SI)(AX*8), X0
+ac_loop: // Loop unrolled 8x do {
+	MOVUPS (SI)(AX*8), X0 // X_i = x[i:i+1]
 	MOVUPS 16(SI)(AX*8), X1
 	MOVUPS 32(SI)(AX*8), X2
 	MOVUPS 48(SI)(AX*8), X3
-	ADDPD X4, X0
+	ADDPD X4, X0 // X_i += a
 	ADDPD X5, X1
 	ADDPD X4, X2
 	ADDPD X5, X3
-	MOVUPS X0, (SI)(AX*8)
+	MOVUPS X0, (SI)(AX*8) // x[i:i+1] = X_i
 	MOVUPS X1, 16(SI)(AX*8)
 	MOVUPS X2, 32(SI)(AX*8)
 	MOVUPS X3, 48(SI)(AX*8)
-	ADDQ $8, AX
-	LOOP ac_loop
-	CMPQ BX, $0
+	ADDQ $8, AX  // i += 8
+	LOOP ac_loop // } while --CX > 0
+	CMPQ BX, $0  // if BX == 0 { return }
 	JE   ac_end

-ac_tail_start:
-	MOVQ BX, CX
+ac_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX

-ac_tail:
-	MOVSD (SI)(AX*8), X0
-	ADDSD X4, X0
-	MOVSD X0, (SI)(AX*8)
-	INCQ AX
-	LOOP ac_tail
+ac_tail: // do {
+	MOVSD (SI)(AX*8), X0 // X0 = x[i]
+	ADDSD X4, X0         // X0 += a
+	MOVSD X0, (SI)(AX*8) // x[i] = X0
+	INCQ AX              // ++i
+	LOOP ac_tail         // } while --CX > 0

 ac_end:
 	RET
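These element-wise kernels all share one shape: a one-element scalar head to reach a 16-byte boundary (Add, Div, DivTo; AddConst skips it), an 8-element unrolled SSE2 body, and a scalar tail for the CX % 8 remainder. A sketch of that structure in plain Go, under the assumption that addRef is just a hypothetical reference and not the exported API:

func addRef(dst, s []float64) {
	n := len(dst)
	if len(s) < n {
		n = len(s) // CMOVQLE: iterate over the shorter slice
	}
	i := 0
	// The asm peels one scalar element here if &dst is not 16-byte aligned.
	for ; i+8 <= n; i += 8 { // unrolled body: 8 elements per iteration
		for j := 0; j < 8; j++ {
			dst[i+j] += s[i+j]
		}
	}
	for ; i < n; i++ { // tail: n % 8 elements
		dst[i] += s[i]
	}
}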
diff --git a/asm/f64/cumprod_amd64.s b/asm/f64/cumprod_amd64.s
index 76f122da..e826c8af 100644
--- a/asm/f64/cumprod_amd64.s
+++ b/asm/f64/cumprod_amd64.s
@@ -7,63 +7,65 @@
 #include "textflag.h"

 TEXT ·CumProd(SB), NOSPLIT, $0
-	MOVQ dst_base+0(FP), DI
-	MOVQ dst_len+8(FP), CX
-	MOVQ s_base+24(FP), SI
-	CMPQ s_len+32(FP), CX
+	MOVQ dst_base+0(FP), DI // DI := &dst
+	MOVQ dst_len+8(FP), CX  // CX := len(dst)
+	MOVQ s_base+24(FP), SI  // SI := &s
+	CMPQ s_len+32(FP), CX   // CX := min( CX, len(s) )
 	CMOVQLE s_len+32(FP), CX
-	MOVQ CX, ret_len+56(FP)
-	CMPQ CX, $0
-	JE   cs_end
-	XORQ AX, AX
+	MOVQ CX, ret_len+56(FP) // len(ret) = CX
+	CMPQ CX, $0             // if CX == 0 { return }
+	JE   cp_end
+	XORQ AX, AX             // i := 0

-	MOVSD (SI), X5
+	MOVSD (SI), X5 // p_prod = { s[0], s[0] }
 	SHUFPD $0, X5, X5
-	MOVSD X5, (DI)
-	INCQ AX
-	DECQ CX
-	JZ   cs_end
+	MOVSD X5, (DI) // dst[0] = s[0]
+	INCQ AX        // ++i
+	DECQ CX        // --CX
+	JZ   cp_end    // if CX == 0 { return }
 	MOVQ CX, BX
-	ANDQ $3, BX
-	SHRQ $2, CX
-	JZ   cs_tail_start
+	ANDQ $3, BX        // BX := CX % 4
+	SHRQ $2, CX        // CX = floor( CX / 4 )
+	JZ   cp_tail_start // if CX == 0 { goto cp_tail_start }

-cs_loop:
-	MOVUPS (SI)(AX*8), X0
+cp_loop: // Loop unrolled 4x do {
+	MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
 	MOVUPS 16(SI)(AX*8), X2
-	MOVAPS X0, X1
+	MOVAPS X0, X1 // X1 = X0
 	MOVAPS X2, X3
-	SHUFPD $1, X1, X1
+	SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
 	SHUFPD $1, X3, X3
-	MULPD X0, X1
+	MULPD X0, X1 // X1 *= X0
 	MULPD X2, X3
-	SHUFPD $2, X1, X0
-	SHUFPD $3, X1, X1
+	SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
+	SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
 	SHUFPD $2, X3, X2
 	SHUFPD $3, X3, X3
-	MULPD X5, X0
-	MULPD X1, X5
+	MULPD X5, X0 // X0 *= p_prod
+	MULPD X1, X5 // p_prod *= X1
 	MULPD X5, X2
-	MOVUPS X0, (DI)(AX*8)
+	MOVUPS X0, (DI)(AX*8) // dst[i] = X0
 	MOVUPS X2, 16(DI)(AX*8)
 	MULPD X3, X5
-	ADDQ $4, AX
-	LOOP cs_loop
-	CMPQ BX, $0
-	JE   cs_end
+	ADDQ $4, AX  // i += 4
+	LOOP cp_loop // } while --CX > 0

-cs_tail_start:
-	MOVQ BX, CX
+	// if BX == 0 { return }
+	CMPQ BX, $0
+	JE   cp_end

-cs_tail:
-	MULSD (SI)(AX*8), X5
-	MOVSD X5, (DI)(AX*8)
-	INCQ AX
-	LOOP cs_tail
+cp_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX

-cs_end:
-	MOVQ DI, ret_base+48(FP)
-	MOVQ dst_cap+16(FP), SI
+cp_tail: // do {
+	MULSD (SI)(AX*8), X5 // p_prod *= s[i]
+	MOVSD X5, (DI)(AX*8) // dst[i] = p_prod
+	INCQ AX              // ++i
+	LOOP cp_tail         // } while --CX > 0
+
+cp_end:
+	MOVQ DI, ret_base+48(FP) // &ret = &dst
+	MOVQ dst_cap+16(FP), SI  // cap(ret) = cap(dst)
 	MOVQ SI, ret_cap+64(FP)
 	RET
diff --git a/asm/f64/cumsum_amd64.s b/asm/f64/cumsum_amd64.s
index 8ca114ca..1c106ff2 100644
--- a/asm/f64/cumsum_amd64.s
+++ b/asm/f64/cumsum_amd64.s
@@ -7,56 +7,58 @@
 #include "textflag.h"

 TEXT ·CumSum(SB), NOSPLIT, $0
-	MOVQ dst_base+0(FP), DI
-	MOVQ dst_len+8(FP), CX
-	MOVQ s_base+24(FP), SI
-	CMPQ s_len+32(FP), CX
+	MOVQ dst_base+0(FP), DI // DI := &dst
+	MOVQ dst_len+8(FP), CX  // CX := len(dst)
+	MOVQ s_base+24(FP), SI  // SI := &s
+	CMPQ s_len+32(FP), CX   // CX := min( CX, len(s) )
 	CMOVQLE s_len+32(FP), CX
-	MOVQ CX, ret_len+56(FP)
-	CMPQ CX, $0
+	MOVQ CX, ret_len+56(FP) // len(ret) = CX
+	CMPQ CX, $0             // if CX == 0 { return }
 	JE   cs_end
-	XORQ AX, AX
-	PXOR X5, X5
+	XORQ AX, AX // i := 0
+	PXOR X5, X5 // p_sum = 0
 	MOVQ CX, BX
-	ANDQ $3, BX
-	SHRQ $2, CX
-	JZ   cs_tail_start
+	ANDQ $3, BX        // BX := CX % 4
+	SHRQ $2, CX        // CX = floor( CX / 4 )
+	JZ   cs_tail_start // if CX == 0 { goto cs_tail_start }

-cs_loop:
-	MOVUPS (SI)(AX*8), X0
+cs_loop: // Loop unrolled 4x do {
+	MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
 	MOVUPS 16(SI)(AX*8), X2
-	MOVAPS X0, X1
+	MOVAPS X0, X1 // X1 = X0
 	MOVAPS X2, X3
-	SHUFPD $1, X1, X1
+	SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
 	SHUFPD $1, X3, X3
-	ADDPD X0, X1
+	ADDPD X0, X1 // X1 += X0
 	ADDPD X2, X3
-	SHUFPD $2, X1, X0
-	SHUFPD $3, X1, X1
+	SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
+	SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
 	SHUFPD $2, X3, X2
 	SHUFPD $3, X3, X3
-	ADDPD X5, X0
-	ADDPD X1, X5
+	ADDPD X5, X0 // X0 += p_sum
+	ADDPD X1, X5 // p_sum += X1
 	ADDPD X5, X2
-	MOVUPS X0, (DI)(AX*8)
+	MOVUPS X0, (DI)(AX*8) // dst[i] = X0
 	MOVUPS X2, 16(DI)(AX*8)
 	ADDPD X3, X5
-	ADDQ $4, AX
-	LOOP cs_loop
-	CMPQ BX, $0
-	JE   cs_end
+	ADDQ $4, AX  // i += 4
+	LOOP cs_loop // } while --CX > 0

-cs_tail_start:
-	MOVQ BX, CX
+	// if BX == 0 { return }
+	CMPQ BX, $0
+	JE   cs_end

-cs_tail:
-	ADDSD (SI)(AX*8), X5
-	MOVSD X5, (DI)(AX*8)
-	INCQ AX
-	LOOP cs_tail
+cs_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX
+
+cs_tail: // do {
+	ADDSD (SI)(AX*8), X5 // p_sum += s[i]
+	MOVSD X5, (DI)(AX*8) // dst[i] = p_sum
+	INCQ AX              // ++i
+	LOOP cs_tail         // } while --CX > 0

 cs_end:
-	MOVQ DI, ret_base+48(FP)
-	MOVQ dst_cap+16(FP), SI
+	MOVQ DI, ret_base+48(FP) // &ret = &dst
+	MOVQ dst_cap+16(FP), SI  // cap(ret) = cap(dst)
 	MOVQ SI, ret_cap+64(FP)
 	RET
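The CumSum/CumProd body is a two-lane prefix scan: SHUFPD/ADDPD turn each pair { a, b } into { a, a+b }, the running total is added to both lanes to produce the output, and the total then advances by a+b. A hypothetical Go model of one CumSum step (cumSumRef is an illustrative name):

func cumSumRef(dst, s []float64) []float64 {
	n := len(dst)
	if len(s) < n {
		n = len(s)
	}
	run, i := 0.0, 0
	for ; i+2 <= n; i += 2 { // one 128-bit lane pair per step
		a, b := s[i], s[i+1]
		dst[i] = run + a       // lane 0: running total + a
		dst[i+1] = run + a + b // lane 1: running total + (a+b)
		run += a + b           // broadcast the new running total
	}
	for ; i < n; i++ { // scalar tail
		run += s[i]
		dst[i] = run
	}
	return dst[:n]
}

CumProd is the same scan with multiplication in place of addition, seeded from s[0].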
diff --git a/asm/f64/div_amd64.s b/asm/f64/div_amd64.s
index 0c895dff..9e3d136e 100644
--- a/asm/f64/div_amd64.s
+++ b/asm/f64/div_amd64.s
@@ -8,59 +8,59 @@
 // func Div(dst, s []float64)
 TEXT ·Div(SB), NOSPLIT, $0
-	MOVQ dst_base+0(FP), DI
-	MOVQ dst_len+8(FP), CX
-	MOVQ s_base+24(FP), SI
-	CMPQ s_len+32(FP), CX
+	MOVQ dst_base+0(FP), DI // DI := &dst
+	MOVQ dst_len+8(FP), CX  // CX := len(dst)
+	MOVQ s_base+24(FP), SI  // SI := &s
+	CMPQ s_len+32(FP), CX   // CX = min( CX, len(s) )
 	CMOVQLE s_len+32(FP), CX
-	CMPQ CX, $0
+	CMPQ CX, $0 // if CX == 0 { return }
 	JE   div_end
-	XORQ AX, AX
+	XORQ AX, AX // i := 0
 	MOVQ SI, BX
-	ANDQ $15, BX
-	JZ   div_no_trim
+	ANDQ $15, BX     // BX := &s & 15
+	JZ   div_no_trim // if BX == 0 { goto div_no_trim }

 	// Align on 16-bit boundary
-	MOVSD (DI)(AX*8), X0
-	DIVSD (SI)(AX*8), X0
-	MOVSD X0, (DI)(AX*8)
-	INCQ AX
-	DECQ CX
-	JZ   div_end
+	MOVSD (DI)(AX*8), X0 // X0 := dst[i]
+	DIVSD (SI)(AX*8), X0 // X0 /= s[i]
+	MOVSD X0, (DI)(AX*8) // dst[i] = X0
+	INCQ AX              // ++i
+	DECQ CX              // --CX
+	JZ   div_end         // if CX == 0 { return }

 div_no_trim:
 	MOVQ CX, BX
-	ANDQ $7, BX
-	SHRQ $3, CX
-	JZ   div_tail_start
+	ANDQ $7, BX         // BX = CX % 8
+	SHRQ $3, CX         // CX = floor( CX / 8 )
+	JZ   div_tail_start // if CX == 0 { goto div_tail_start }

-div_loop: // Loop unrolled 8x
-	MOVUPS (SI)(AX*8), X0
-	MOVUPS 16(SI)(AX*8), X1
-	MOVUPS 32(SI)(AX*8), X2
-	MOVUPS 48(SI)(AX*8), X3
-	DIVPD (DI)(AX*8), X0
-	DIVPD 16(DI)(AX*8), X1
-	DIVPD 32(DI)(AX*8), X2
-	DIVPD 48(DI)(AX*8), X3
-	MOVUPS X0, (DI)(AX*8)
+div_loop: // Loop unrolled 8x do {
+	MOVUPS (DI)(AX*8), X0 // X0 := dst[i:i+1]
+	MOVUPS 16(DI)(AX*8), X1
+	MOVUPS 32(DI)(AX*8), X2
+	MOVUPS 48(DI)(AX*8), X3
+	DIVPD (SI)(AX*8), X0 // X0 /= s[i:i+1]
+	DIVPD 16(SI)(AX*8), X1
+	DIVPD 32(SI)(AX*8), X2
+	DIVPD 48(SI)(AX*8), X3
+	MOVUPS X0, (DI)(AX*8) // dst[i] = X0
 	MOVUPS X1, 16(DI)(AX*8)
 	MOVUPS X2, 32(DI)(AX*8)
 	MOVUPS X3, 48(DI)(AX*8)
-	ADDQ $4, AX
-	LOOP div_loop
-	CMPQ BX, $0
+	ADDQ $8, AX   // i += 8
+	LOOP div_loop // } while --CX > 0
+	CMPQ BX, $0   // if BX == 0 { return }
 	JE   div_end

-div_tail_start:
-	MOVQ BX, CX
+div_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX

-div_tail:
-	MOVSD (DI)(AX*8), X0
-	DIVSD (SI)(AX*8), X0
-	MOVSD X0, (DI)(AX*8)
-	INCQ AX
-	LOOP div_tail
+div_tail: // do {
+	MOVSD (DI)(AX*8), X0 // X0 = dst[i]
+	DIVSD (SI)(AX*8), X0 // X0 /= s[i]
+	MOVSD X0, (DI)(AX*8) // dst[i] = X0
+	INCQ AX              // ++i
+	LOOP div_tail        // } while --CX > 0

 div_end:
 	RET
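This hunk fixes two real bugs in Div, not just comments: the unrolled body loaded s and divided by dst (computing s[i]/dst[i] instead of dst[i]/s[i]), and the index advanced by 4 while the loop consumed 8 elements per iteration. The intended semantics, as a hypothetical scalar reference:

// divRef: dst[i] /= s[i] over min(len(dst), len(s)) elements.
func divRef(dst, s []float64) {
	n := len(dst)
	if len(s) < n {
		n = len(s)
	}
	for i := 0; i < n; i++ {
		dst[i] /= s[i]
	}
}

The new 8-element case added to stubs_test.go below exercises a full pass through the unrolled body.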
diff --git a/asm/f64/divto_amd64.s b/asm/f64/divto_amd64.s
index b0463369..b249eece 100644
--- a/asm/f64/divto_amd64.s
+++ b/asm/f64/divto_amd64.s
@@ -8,66 +8,66 @@
 // func DivTo(dst, x, y []float64)
 TEXT ·DivTo(SB), NOSPLIT, $0
-	MOVQ dst_base+0(FP), DI
-	MOVQ dst_len+8(FP), CX
-	MOVQ x_base+24(FP), SI
-	MOVQ y_base+48(FP), DX
-	CMPQ x_len+32(FP), CX
+	MOVQ dst_base+0(FP), DI // DI := &dst
+	MOVQ dst_len+8(FP), CX  // CX := len(dst)
+	MOVQ x_base+24(FP), SI  // SI := &x
+	MOVQ y_base+48(FP), DX  // DX := &y
+	CMPQ x_len+32(FP), CX   // CX = min( len(dst), len(x), len(y) )
 	CMOVQLE x_len+32(FP), CX
 	CMPQ y_len+56(FP), CX
 	CMOVQLE y_len+56(FP), CX
-	MOVQ CX, ret_len+80(FP)
-	CMPQ CX, $0
+	MOVQ CX, ret_len+80(FP) // len(ret) = CX
+	CMPQ CX, $0             // if CX == 0 { return }
 	JE   div_end
-	XORQ AX, AX
+	XORQ AX, AX // i := 0
 	MOVQ DI, BX
-	ANDQ $15, BX
-	JZ   div_no_trim
+	ANDQ $15, BX     // BX := &dst & 0xF
+	JZ   div_no_trim // if BX == 0 { goto div_no_trim }

 	// Align on 16-bit boundary
-	MOVSD (SI)(AX*8), X0
-	DIVSD (DX)(AX*8), X0
-	MOVSD X0, (DI)(AX*8)
-	INCQ AX
-	DECQ CX
-	JZ   div_end // */
+	MOVSD (SI)(AX*8), X0 // X0 := x[i]
+	DIVSD (DX)(AX*8), X0 // X0 /= y[i]
+	MOVSD X0, (DI)(AX*8) // dst[i] = X0
+	INCQ AX              // ++i
+	DECQ CX              // --CX
+	JZ   div_end         // if CX == 0 { return }

 div_no_trim:
 	MOVQ CX, BX
-	ANDQ $7, BX
-	SHRQ $3, CX
-	JZ   div_tail_start
+	ANDQ $7, BX         // BX = CX % 8
+	SHRQ $3, CX         // CX = floor( CX / 8 )
+	JZ   div_tail_start // if CX == 0 { goto div_tail_start }

-div_loop: // Unroll 8x
-	MOVUPS (SI)(AX*8), X0
+div_loop: // Loop unrolled 8x do {
+	MOVUPS (SI)(AX*8), X0 // X0 := x[i:i+1]
 	MOVUPS 16(SI)(AX*8), X1
 	MOVUPS 32(SI)(AX*8), X2
 	MOVUPS 48(SI)(AX*8), X3
-	DIVPD (DX)(AX*8), X0
+	DIVPD (DX)(AX*8), X0 // X0 /= y[i:i+1]
 	DIVPD 16(DX)(AX*8), X1
 	DIVPD 32(DX)(AX*8), X2
 	DIVPD 48(DX)(AX*8), X3
-	MOVUPS X0, (DI)(AX*8)
+	MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X0
 	MOVUPS X1, 16(DI)(AX*8)
 	MOVUPS X2, 32(DI)(AX*8)
 	MOVUPS X3, 48(DI)(AX*8)
-	ADDQ $8, AX
-	LOOP div_loop
-	CMPQ CX, $0
+	ADDQ $8, AX   // i += 8
+	LOOP div_loop // } while --CX > 0
+	CMPQ BX, $0   // if BX == 0 { return }
 	JE   div_end

-div_tail_start:
-	MOVQ BX, CX
+div_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX

-div_tail:
-	MOVSD (SI)(AX*8), X0
-	DIVSD (DX)(AX*8), X0
+div_tail: // do {
+	MOVSD (SI)(AX*8), X0 // X0 = x[i]
+	DIVSD (DX)(AX*8), X0 // X0 /= y[i]
 	MOVSD X0, (DI)(AX*8)
-	INCQ AX
-	LOOP div_tail
+	INCQ AX       // ++i
+	LOOP div_tail // } while --CX > 0

 div_end:
-	MOVQ DI, ret_base+72(FP)
-	MOVQ dst_cap+16(FP), DI
+	MOVQ DI, ret_base+72(FP) // &ret = &dst
+	MOVQ dst_cap+16(FP), DI  // cap(ret) = cap(dst)
 	MOVQ DI, ret_cap+88(FP)
 	RET
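DivTo had a subtler bug: after LOOP the count register CX is always zero, so the old CMPQ CX, $0 / JE div_end skipped the scalar tail whenever the unrolled loop had run; testing BX (the remainder count) restores it. A hypothetical reference for the intended result:

// divToRef models DivTo: dst[i] = x[i] / y[i], returning the written prefix.
func divToRef(dst, x, y []float64) []float64 {
	n := len(dst)
	if len(x) < n {
		n = len(x)
	}
	if len(y) < n {
		n = len(y)
	}
	for i := 0; i < n; i++ {
		dst[i] = x[i] / y[i]
	}
	return dst[:n]
}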
diff --git a/asm/f64/l1norm_amd64.s b/asm/f64/l1norm_amd64.s
index 0ced8e78..d740c956 100644
--- a/asm/f64/l1norm_amd64.s
+++ b/asm/f64/l1norm_amd64.s
@@ -8,46 +8,51 @@
 // func L1Norm(s, t []float64) float64
 TEXT ·L1Norm(SB), NOSPLIT, $0
-	MOVQ s_base+0(FP), DI
-	MOVQ t_base+24(FP), SI
-	MOVQ s_len+8(FP), DX
-	CMPQ t_len+32(FP), DX
-	CMOVQLE t_len+32(FP), DX
-	PXOR X3, X3
-	XORQ AX, AX
-	CMPQ DX, $1
-	JL   l1_end
-	SUBQ $1, DX
-	JE   l1_tail
+	MOVQ s_base+0(FP), DI  // DI := &s
+	MOVQ t_base+24(FP), SI // SI := &t
+	MOVQ s_len+8(FP), CX   // CX := len(s)
+	CMPQ t_len+32(FP), CX  // CX = min( CX, len(t) )
+	CMOVQLE t_len+32(FP), CX
+	PXOR X3, X3 // norm := 0
+	CMPQ CX, $0 // if CX == 0 { return 0 }
+	JE   l1_end
+	XORQ AX, AX // i := 0
+	MOVQ CX, BX
+	ANDQ $1, BX        // BX := CX % 2
+	SHRQ $1, CX        // CX := floor( CX / 2 )
+	JZ   l1_tail_start // if CX == 0 { goto l1_tail_start }

-l1_loop:
-	MOVUPS (SI)(AX*8), X0
-	MOVUPS (DI)(AX*8), X1
+l1_loop: // Loop unrolled 2x do {
+	MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
+	MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
 	MOVAPS X0, X2
 	SUBPD X1, X0
 	SUBPD X2, X1
-	MAXPD X1, X0
-	ADDPD X0, X3
-	ADDQ $2, AX
-	CMPQ AX, DX
-	JL   l1_loop
-	JG   l1_end
+	MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
+	ADDPD X0, X3 // norm += X0
+	ADDQ $2, AX  // i += 2
+	LOOP l1_loop // } while --CX > 0
+	CMPQ BX, $0  // if BX == 0 { return }
+	JE   l1_end
+
+l1_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX
+	PXOR X0, X0 // reset X0, X1 to break dependencies
+	PXOR X1, X1

 l1_tail:
-	PXOR X0, X0
-	PXOR X1, X1
-	MOVSD (SI)(AX*8), X0
-	MOVSD (DI)(AX*8), X1
+	MOVSD (SI)(AX*8), X0 // X0 = t[i]
+	MOVSD (DI)(AX*8), X1 // X1 = s[i]
 	MOVAPD X0, X2
 	SUBSD X1, X0
 	SUBSD X2, X1
-	MAXSD X1, X0
-	ADDSD X0, X3
+	MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
+	ADDSD X0, X3 // norm += X0

 l1_end:
 	MOVAPS X3, X2
 	SHUFPD $1, X2, X2
-	ADDSD X3, X2
-	MOVSD X2, ret+48(FP)
+	ADDSD X3, X2         // X2 = X3[1] + X3[0]
+	MOVSD X2, ret+48(FP) // return X2
 	RET
diff --git a/asm/f64/linfnorm_amd64.s b/asm/f64/linfnorm_amd64.s
index 0d22f163..444c0bbb 100644
--- a/asm/f64/linfnorm_amd64.s
+++ b/asm/f64/linfnorm_amd64.s
@@ -8,45 +8,50 @@
 // func LinfNorm(s, t []float64) float64
 TEXT ·LinfNorm(SB), NOSPLIT, $0
-	MOVQ s_base+0(FP), DI
-	MOVQ t_base+24(FP), SI
-	MOVQ s_len+8(FP), DX
-	CMPQ t_len+32(FP), DX
-	CMOVQLE t_len+32(FP), DX
-	PXOR X3, X3
-	XORQ AX, AX
-	CMPQ DX, $1
-	JL   l1_end
-	SUBQ $1, DX
-	JE   l1_tail
+	MOVQ s_base+0(FP), DI  // DI := &s
+	MOVQ t_base+24(FP), SI // SI := &t
+	MOVQ s_len+8(FP), CX   // CX := len(s)
+	CMPQ t_len+32(FP), CX  // CX = min( CX, len(t) )
+	CMOVQLE t_len+32(FP), CX
+	PXOR X3, X3 // norm := 0
+	CMPQ CX, $0 // if CX == 0 { return 0 }
+	JE   l1_end
+	XORQ AX, AX // i := 0
+	MOVQ CX, BX
+	ANDQ $1, BX        // BX := CX % 2
+	SHRQ $1, CX        // CX := floor( CX / 2 )
+	JZ   l1_tail_start // if CX == 0 { goto l1_tail_start }

-l1_loop:
-	MOVUPS (SI)(AX*8), X0
-	MOVUPS (DI)(AX*8), X1
+l1_loop: // Loop unrolled 2x do {
+	MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
+	MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
 	MOVAPS X0, X2
 	SUBPD X1, X0
 	SUBPD X2, X1
-	MAXPD X1, X0
-	MAXPD X0, X3
-	ADDQ $2, AX
-	CMPQ AX, DX
-	JL   l1_loop
-	JG   l1_end
+	MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
+	MAXPD X0, X3 // norm = max( norm, X0 )
+	ADDQ $2, AX  // i += 2
+	LOOP l1_loop // } while --CX > 0
+	CMPQ BX, $0  // if BX == 0 { return }
+	JE   l1_end
+
+l1_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX
+	PXOR X0, X0 // reset X0, X1 to break dependencies
+	PXOR X1, X1

 l1_tail:
-	PXOR X0, X0
-	PXOR X1, X1
-	MOVSD (SI)(AX*8), X0
-	MOVSD (DI)(AX*8), X1
+	MOVSD (SI)(AX*8), X0 // X0 = t[i]
+	MOVSD (DI)(AX*8), X1 // X1 = s[i]
 	MOVAPD X0, X2
 	SUBSD X1, X0
 	SUBSD X2, X1
-	MAXSD X1, X0
-	MAXSD X0, X3
+	MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
+	MAXSD X0, X3 // norm = max( norm, X0 )

 l1_end:
 	MOVAPS X3, X2
 	SHUFPD $1, X2, X2
-	MAXSD X3, X2
-	MOVSD X2, ret+48(FP)
+	MAXSD X3, X2         // X2 = max( X3[1], X3[0] )
+	MOVSD X2, ret+48(FP) // return X2
 	RET
diff --git a/asm/f64/stubs_test.go b/asm/f64/stubs_test.go
index c32a2711..b330f4f9 100644
--- a/asm/f64/stubs_test.go
+++ b/asm/f64/stubs_test.go
@@ -447,6 +447,11 @@ func TestDiv(t *testing.T) {
 			src:    []float64{1, 2, 3, 4},
 			expect: []float64{1, 1, 1, 1},
 		},
+		{
+			dst:    []float64{1, 2, 3, 4, 2, 4, 6, 8},
+			src:    []float64{1, 2, 3, 4, 1, 2, 3, 4},
+			expect: []float64{1, 1, 1, 1, 2, 2, 2, 2},
+		},
 		{
 			dst:    []float64{2, 4, 6},
 			src:    []float64{1, 2, 3},
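For completeness: L1Norm and LinfNorm compute norms of the element-wise difference using the same branch-free |d| = max(d, -d) construction, accumulating with ADDPD and MAXPD respectively. Hypothetical scalar references (the asm iterates over min(len(s), len(t)); these assume equal lengths for brevity):

func l1NormRef(s, t []float64) (norm float64) {
	for i := range s {
		d := s[i] - t[i]
		if d < 0 {
			d = -d // |s[i] - t[i]| without math.Abs
		}
		norm += d
	}
	return norm
}

func linfNormRef(s, t []float64) (norm float64) {
	for i := range s {
		d := s[i] - t[i]
		if d < 0 {
			d = -d
		}
		if d > norm {
			norm = d
		}
	}
	return norm
}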