asm/f64 Comment assembly code

This commit is contained in:
Chad Kunde
2016-06-16 10:25:42 -07:00
parent 2f9b0ec3b3
commit 2d0053676b
11 changed files with 360 additions and 331 deletions

View File

@@ -8,10 +8,10 @@
// func AbsSum(x []float64) float64
TEXT ·AbsSum(SB), NOSPLIT, $0
MOVQ x_base+0(FP), SI
MOVQ x_len+8(FP), CX
XORQ AX, AX
PXOR X0, X0
MOVQ x_base+0(FP), SI // SI := &x
MOVQ x_len+8(FP), CX // CX := len(x)
XORQ AX, AX // i := 0
PXOR X0, X0 // p_sum_i := 0
PXOR X1, X1
PXOR X2, X2
PXOR X3, X3
@@ -19,59 +19,64 @@ TEXT ·AbsSum(SB), NOSPLIT, $0
PXOR X5, X5
PXOR X6, X6
PXOR X7, X7
CMPQ CX, $0
CMPQ CX, $0 // if CX == 0 { return 0 }
JE absum_end
MOVQ CX, BX
ANDQ $7, BX
SHRQ $3, CX
JZ absum_tail_start
ANDQ $7, BX // BX := CX % 8
SHRQ $3, CX // CX = floor( CX / 8 )
JZ absum_tail_start // if CX == 0 { goto absum_tail_start }
absum_loop:
MOVUPS (SI)(AX*8), X8
absum_loop: // do {
// p_sum = max( p_sum + x[i], p_sum - x[i] )
MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+1]
MOVUPS 16(SI)(AX*8), X9
MOVUPS 32(SI)(AX*8), X10
MOVUPS 48(SI)(AX*8), X11
ADDPD X8, X0
ADDPD X8, X0 // p_sum_i += X_i ( positive values )
ADDPD X9, X2
ADDPD X10, X4
ADDPD X11, X6
SUBPD X8, X1
SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
SUBPD X9, X3
SUBPD X10, X5
SUBPD X11, X7
MAXPD X1, X0
MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
MAXPD X3, X2
MAXPD X5, X4
MAXPD X7, X6
MOVAPS X0, X1
MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
MOVAPS X2, X3
MOVAPS X4, X5
MOVAPS X6, X7
ADDQ $8, AX
LOOP absum_loop
ADDPD X3, X0
ADDPD X5, X7
ADDPD X7, X0
ADDQ $8, AX // i += 8
LOOP absum_loop // } while --CX > 0
// p_sum_0 = \sum_{i=0}^{3}( p_sum_(i*2) )
ADDPD X3, X0
ADDPD X5, X7
ADDPD X7, X0
// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
MOVAPS X0, X1
SHUFPD $0x3, X0, X0
SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
ADDSD X1, X0
MOVSD X0, X1
CMPQ BX, $0
JE absum_end
JE absum_end // if BX == 0 { goto absum_end }
absum_tail_start:
MOVQ BX, CX
XORPS X8, X8
absum_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
XORPS X8, X8 // X_8 = 0
absum_tail:
MOVSD (SI)(AX*8), X8
ADDSD X8, X0
SUBSD X8, X1
MAXSD X1, X0
MOVSD X0, X1
INCQ AX
LOOP absum_tail
absum_tail: // do {
// p_sum = max( p_sum + x[i], p_sum - x[i] )
MOVSD (SI)(AX*8), X8 // X_8 = x[i]
MOVSD X0, X1 // p_sum_1 = p_sum_0
ADDSD X8, X0 // p_sum_0 += X_8
SUBSD X8, X1 // p_sum_1 -= X_8
MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 )
INCQ AX // i++
LOOP absum_tail // } while --CX > 0
absum_end:
MOVSD X1, sum+24(FP)
absum_end: // return p_sum_0
MOVSD X0, sum+24(FP)
RET

View File

@@ -8,13 +8,13 @@
// func AbsSumInc(x []float64, n, incX int) (sum float64)
TEXT ·AbsSumInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), SI
MOVQ n+24(FP), CX
MOVQ incX+32(FP), AX
MOVQ x_base+0(FP), SI // SI := &x
MOVQ n+24(FP), CX // CX := n
MOVQ incX+32(FP), AX // AX := increment * sizeof( float64 )
SHLQ $3, AX
MOVQ AX, DX
MOVQ AX, DX // DX := AX * 3
IMULQ $3, DX
PXOR X0, X0
PXOR X0, X0 // p_sum_i := 0
PXOR X1, X1
PXOR X2, X2
PXOR X3, X3
@@ -22,64 +22,69 @@ TEXT ·AbsSumInc(SB), NOSPLIT, $0
PXOR X5, X5
PXOR X6, X6
PXOR X7, X7
CMPQ CX, $0
CMPQ CX, $0 // if CX == 0 { return 0 }
JE absum_end
MOVQ CX, BX
ANDQ $7, BX
SHRQ $3, CX
JZ absum_tail_start
ANDQ $7, BX // BX := CX % 8
SHRQ $3, CX // CX = floor( CX / 8 )
JZ absum_tail_start // if CX == 0 { goto absum_tail_start }
absum_loop:
MOVSD (SI), X8
absum_loop: // do {
// p_sum = max( p_sum + x[i], p_sum - x[i] )
MOVSD (SI), X8 // X_i[0] = x[i]
MOVSD (SI)(AX*1), X9
MOVSD (SI)(AX*2), X10
MOVSD (SI)(DX*1), X11
LEAQ (SI)(AX*4), SI
MOVHPD (SI), X8
LEAQ (SI)(AX*4), SI // SI += AX * 4
MOVHPD (SI), X8 // X_i[1] = x[i+4]
MOVHPD (SI)(AX*1), X9
MOVHPD (SI)(AX*2), X10
MOVHPD (SI)(DX*1), X11
ADDPD X8, X0
ADDPD X8, X0 // p_sum_i += X_i ( positive values )
ADDPD X9, X2
ADDPD X10, X4
ADDPD X11, X6
SUBPD X8, X1
SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
SUBPD X9, X3
SUBPD X10, X5
SUBPD X11, X7
MAXPD X1, X0
MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
MAXPD X3, X2
MAXPD X5, X4
MAXPD X7, X6
MOVAPS X0, X1
MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
MOVAPS X2, X3
MOVAPS X4, X5
MOVAPS X6, X7
LEAQ (SI)(AX*4), SI
LOOP absum_loop
ADDPD X3, X0
ADDPD X5, X7
ADDPD X7, X0
LEAQ (SI)(AX*4), SI // SI += AX * 4
LOOP absum_loop // } while --CX > 0
// p_sum_0 = \sum_{i=0}^{3}( p_sum_(i*2) )
ADDPD X3, X0
ADDPD X5, X7
ADDPD X7, X0
// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
MOVAPS X0, X1
SHUFPD $0x3, X0, X0
SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
ADDSD X1, X0
MOVSD X0, X1
CMPQ BX, $0
JE absum_end
JE absum_end // if BX == 0 { goto absum_end }
absum_tail_start:
MOVQ BX, CX
XORPS X8, X8
absum_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
XORPS X8, X8 // X_8 = 0
absum_tail:
MOVSD (SI), X8
ADDSD X8, X0
SUBSD X8, X1
MAXSD X1, X0
MOVSD X0, X1
ADDQ AX, SI
LOOP absum_tail
absum_tail: // do {
// p_sum = max( p_sum + x[i], p_sum - x[i] )
MOVSD (SI), X8 // X_8 = x[i]
MOVSD X0, X1 // p_sum_1 = p_sum_0
ADDSD X8, X0 // p_sum_0 += X_8
SUBSD X8, X1 // p_sum_1 -= X_8
MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 )
ADDQ AX, SI // i++
LOOP absum_tail // } while --CX > 0
absum_end:
MOVSD X1, sum+40(FP)
absum_end: // return p_sum_0
MOVSD X0, sum+40(FP)
RET

View File

@@ -8,59 +8,59 @@
// func Add(dst, s []float64)
TEXT ·Add(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), CX
MOVQ s_base+24(FP), SI
CMPQ s_len+32(FP), CX
MOVQ dst_base+0(FP), DI // DI := &dst
MOVQ dst_len+8(FP), CX // CX := len(dst)
MOVQ s_base+24(FP), SI // SI := &s
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
CMOVQLE s_len+32(FP), CX
CMPQ CX, $0
CMPQ CX, $0 // if CX == 0 { return }
JE add_end
XORQ AX, AX
MOVQ DI, BX
ANDQ $0x0F, BX
JZ add_no_trim
ANDQ $0x0F, BX // BX := &dst & 15
JZ add_no_trim // if BX == 0 { goto add_no_trim }
// Align on 16-byte boundary
MOVSD (DI)(AX*8), X0
ADDSD (SI)(AX*8), X0
MOVSD X0, (DI)(AX*8)
INCQ AX
DECQ CX
JE add_end
MOVSD (SI)(AX*8), X0 // X0 = s[i]
ADDSD (DI)(AX*8), X0 // X0 += dst[i]
MOVSD X0, (DI)(AX*8) // dst[i] = X0
INCQ AX // i++
DECQ CX // --CX
JE add_end // if CX == 0 { return }
add_no_trim:
MOVQ CX, BX
ANDQ $7, BX
SHRQ $3, CX
JZ add_tail_start
ANDQ $7, BX // BX := CX % 8
SHRQ $3, CX // CX = floor( CX / 8 )
JZ add_tail_start // if CX == 0 { goto add_tail_start }
add_loop: // Loop unrolled 8x
MOVUPS (SI)(AX*8), X0
add_loop: // Loop unrolled 8x do {
MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1]
MOVUPS 16(SI)(AX*8), X1
MOVUPS 32(SI)(AX*8), X2
MOVUPS 48(SI)(AX*8), X3
ADDPD (DI)(AX*8), X0
ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+1]
ADDPD 16(DI)(AX*8), X1
ADDPD 32(DI)(AX*8), X2
ADDPD 48(DI)(AX*8), X3
MOVUPS X0, (DI)(AX*8)
MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X_i
MOVUPS X1, 16(DI)(AX*8)
MOVUPS X2, 32(DI)(AX*8)
MOVUPS X3, 48(DI)(AX*8)
ADDQ $8, AX
LOOP add_loop
CMPQ BX, $0
ADDQ $8, AX // i += 8
LOOP add_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE add_end
add_tail_start:
MOVQ BX, CX
add_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
add_tail:
MOVSD (DI)(AX*8), X0
ADDSD (SI)(AX*8), X0
MOVSD X0, (DI)(AX*8)
INCQ AX
LOOP add_tail
add_tail: // do {
MOVSD (SI)(AX*8), X0 // X0 = s[i]
ADDSD (DI)(AX*8), X0 // X0 += dst[i]
MOVSD X0, (DI)(AX*8) // dst[i] = X0
INCQ AX // ++i
LOOP add_tail // } while --CX > 0
add_end:
RET

View File

@@ -8,46 +8,46 @@
// func AddConst(alpha float64, x []float64)
TEXT ·AddConst(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SI
MOVQ x_len+16(FP), CX
CMPQ CX, $0
MOVQ x_base+8(FP), SI // SI := &x
MOVQ x_len+16(FP), CX // CX := len(x)
CMPQ CX, $0 // if len(x) == 0 { return }
JE ac_end
MOVSD alpha+0(FP), X4
MOVSD alpha+0(FP), X4 // X4 = { a, a }
SHUFPD $0, X4, X4
MOVUPS X4, X5
XORQ AX, AX
MOVUPS X4, X5 // X5 = X4
XORQ AX, AX // i = 0
MOVQ CX, BX
ANDQ $7, BX
SHRQ $3, CX
JZ ac_tail_start
ANDQ $7, BX // BX := len(x) % 8
SHRQ $3, CX // CX := floor( CX / 8 )
JZ ac_tail_start // if CX == 0 { goto ac_tail_start }
ac_loop:
MOVUPS (SI)(AX*8), X0
ac_loop: // Loop unrolled 8x do {
MOVUPS (SI)(AX*8), X0 // X_i = x[i:i+1]
MOVUPS 16(SI)(AX*8), X1
MOVUPS 32(SI)(AX*8), X2
MOVUPS 48(SI)(AX*8), X3
ADDPD X4, X0
ADDPD X4, X0 // X_i += a
ADDPD X5, X1
ADDPD X4, X2
ADDPD X5, X3
MOVUPS X0, (SI)(AX*8)
MOVUPS X0, (SI)(AX*8) // x[i:i+1] = X_i
MOVUPS X1, 16(SI)(AX*8)
MOVUPS X2, 32(SI)(AX*8)
MOVUPS X3, 48(SI)(AX*8)
ADDQ $8, AX
LOOP ac_loop
CMPQ BX, $0
ADDQ $8, AX // i += 8
LOOP ac_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE ac_end
ac_tail_start:
MOVQ BX, CX
ac_tail_start: // Reset loop counters
MOVQ BX, CX // Loop counter: CX = BX
ac_tail:
MOVSD (SI)(AX*8), X0
ADDSD X4, X0
MOVSD X0, (SI)(AX*8)
INCQ AX
LOOP ac_tail
ac_tail: // do {
MOVSD (SI)(AX*8), X0 // X0 = x[i]
ADDSD X4, X0 // X0 += a
MOVSD X0, (SI)(AX*8) // x[i] = X0
INCQ AX // ++i
LOOP ac_tail // } while --CX > 0
ac_end:
RET

View File

@@ -7,63 +7,65 @@
#include "textflag.h"
TEXT ·CumProd(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), CX
MOVQ s_base+24(FP), SI
CMPQ s_len+32(FP), CX
MOVQ dst_base+0(FP), DI // DI := &dst
MOVQ dst_len+8(FP), CX // CX := len(dst)
MOVQ s_base+24(FP), SI // SI := &s
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
CMOVQLE s_len+32(FP), CX
MOVQ CX, ret_len+56(FP)
CMPQ CX, $0
JE cs_end
XORQ AX, AX
MOVQ CX, ret_len+56(FP) // len(ret) = CX
CMPQ CX, $0 // if CX == 0 { return }
JE cp_end
XORQ AX, AX // i := 0
MOVSD (SI), X5
MOVSD (SI), X5 // p_prod = { s[0], s[0] }
SHUFPD $0, X5, X5
MOVSD X5, (DI)
INCQ AX
DECQ CX
JZ cs_end
MOVSD X5, (DI) // dst[0] = s[0]
INCQ AX // ++i
DECQ CX // -- CX
JZ cp_end // if CX == 0 { return }
MOVQ CX, BX
ANDQ $3, BX
SHRQ $2, CX
JZ cs_tail_start
ANDQ $3, BX // BX := CX % 4
SHRQ $2, CX // CX = floor( CX / 4 )
JZ cp_tail_start // if CX == 0 { goto cp_tail_start }
cs_loop:
MOVUPS (SI)(AX*8), X0
cp_loop: // Loop unrolled 4x do {
MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
MOVUPS 16(SI)(AX*8), X2
MOVAPS X0, X1
MOVAPS X0, X1 // X1 = X0
MOVAPS X2, X3
SHUFPD $1, X1, X1
SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
SHUFPD $1, X3, X3
MULPD X0, X1
MULPD X0, X1 // X1 *= X0
MULPD X2, X3
SHUFPD $2, X1, X0
SHUFPD $3, X1, X1
SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
SHUFPD $2, X3, X2
SHUFPD $3, X3, X3
MULPD X5, X0
MULPD X1, X5
MULPD X5, X0 // X0 *= p_prod
MULPD X1, X5 // p_prod *= X1
MULPD X5, X2
MOVUPS X0, (DI)(AX*8)
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
MOVUPS X2, 16(DI)(AX*8)
MULPD X3, X5
ADDQ $4, AX
LOOP cs_loop
CMPQ BX, $0
JE cs_end
ADDQ $4, AX // i += 4
LOOP cp_loop // } while --CX > 0
cs_tail_start:
MOVQ BX, CX
// if BX == 0 { return }
CMPQ BX, $0
JE cp_end
cs_tail:
MULSD (SI)(AX*8), X5
MOVSD X5, (DI)(AX*8)
INCQ AX
LOOP cs_tail
cp_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
cs_end:
MOVQ DI, ret_base+48(FP)
MOVQ dst_cap+16(FP), SI
cp_tail: // do {
MULSD (SI)(AX*8), X5 // p_prod *= s[i]
MOVSD X5, (DI)(AX*8) // dst[i] = p_prod
INCQ AX // ++i
LOOP cp_tail // } while --CX > 0
cp_end:
MOVQ DI, ret_base+48(FP) // &ret = &dst
MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst)
MOVQ SI, ret_cap+64(FP)
RET

View File

@@ -7,56 +7,58 @@
#include "textflag.h"
TEXT ·CumSum(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), CX
MOVQ s_base+24(FP), SI
CMPQ s_len+32(FP), CX
MOVQ dst_base+0(FP), DI // DI := &dst
MOVQ dst_len+8(FP), CX // CX := len(dst)
MOVQ s_base+24(FP), SI // SI := &s
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
CMOVQLE s_len+32(FP), CX
MOVQ CX, ret_len+56(FP)
CMPQ CX, $0
MOVQ CX, ret_len+56(FP) // len(ret) = CX
CMPQ CX, $0 // if CX == 0 { return }
JE cs_end
XORQ AX, AX
PXOR X5, X5
XORQ AX, AX // i := 0
PXOR X5, X5 // p_sum = 0
MOVQ CX, BX
ANDQ $3, BX
SHRQ $2, CX
JZ cs_tail_start
ANDQ $3, BX // BX := CX % 4
SHRQ $2, CX // CX = floor( CX / 4 )
JZ cs_tail_start // if CX == 0 { goto cs_tail_start }
cs_loop:
MOVUPS (SI)(AX*8), X0
cs_loop: // Loop unrolled 4x do {
MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
MOVUPS 16(SI)(AX*8), X2
MOVAPS X0, X1
MOVAPS X0, X1 // X1 = X0
MOVAPS X2, X3
SHUFPD $1, X1, X1
SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
SHUFPD $1, X3, X3
ADDPD X0, X1
ADDPD X0, X1 // X1 += X0
ADDPD X2, X3
SHUFPD $2, X1, X0
SHUFPD $3, X1, X1
SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
SHUFPD $2, X3, X2
SHUFPD $3, X3, X3
ADDPD X5, X0
ADDPD X1, X5
ADDPD X5, X0 // X0 += p_sum
ADDPD X1, X5 // p_sum += X1
ADDPD X5, X2
MOVUPS X0, (DI)(AX*8)
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
MOVUPS X2, 16(DI)(AX*8)
ADDPD X3, X5
ADDQ $4, AX
LOOP cs_loop
CMPQ BX, $0
JE cs_end
ADDQ $4, AX // i += 4
LOOP cs_loop // } while --CX > 0
cs_tail_start:
MOVQ BX, CX
// if BX == 0 { return }
CMPQ BX, $0
JE cs_end
cs_tail:
ADDSD (SI)(AX*8), X5
MOVSD X5, (DI)(AX*8)
INCQ AX
LOOP cs_tail
cs_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
cs_tail: // do {
ADDSD (SI)(AX*8), X5 // p_sum += s[i]
MOVSD X5, (DI)(AX*8) // dst[i] = p_sum
INCQ AX // ++i
LOOP cs_tail // } while --CX > 0
cs_end:
MOVQ DI, ret_base+48(FP)
MOVQ dst_cap+16(FP), SI
MOVQ DI, ret_base+48(FP) // &ret = &dst
MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst)
MOVQ SI, ret_cap+64(FP)
RET

View File

@@ -8,59 +8,59 @@
// func Div(dst, s []float64)
TEXT ·Div(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), CX
MOVQ s_base+24(FP), SI
CMPQ s_len+32(FP), CX
MOVQ dst_base+0(FP), DI // DI := &dst
MOVQ dst_len+8(FP), CX // CX := len(dst)
MOVQ s_base+24(FP), SI // SI := &s
CMPQ s_len+32(FP), CX // CX = min( CX, len(s) )
CMOVQLE s_len+32(FP), CX
CMPQ CX, $0
CMPQ CX, $0 // if CX == 0 { return }
JE div_end
XORQ AX, AX
XORQ AX, AX // i := 0
MOVQ SI, BX
ANDQ $15, BX
JZ div_no_trim
ANDQ $15, BX // BX := &s & 15
JZ div_no_trim // if BX == 0 { goto div_no_trim }
// Align on 16-byte boundary
MOVSD (DI)(AX*8), X0
DIVSD (SI)(AX*8), X0
MOVSD X0, (DI)(AX*8)
INCQ AX
DECQ CX
JZ div_end
MOVSD (DI)(AX*8), X0 // X0 := dst[i]
DIVSD (SI)(AX*8), X0 // X0 /= s[i]
MOVSD X0, (DI)(AX*8) // dst[i] = X0
INCQ AX // ++i
DECQ CX // --CX
JZ div_end // if CX == 0 { return }
div_no_trim:
MOVQ CX, BX
ANDQ $7, BX
SHRQ $3, CX
JZ div_tail_start
ANDQ $7, BX // BX = CX % 8
SHRQ $3, CX // CX = floor( CX / 8 )
JZ div_tail_start // if CX == 0 { goto div_tail_start }
div_loop: // Loop unrolled 8x
MOVUPS (SI)(AX*8), X0
MOVUPS 16(SI)(AX*8), X1
MOVUPS 32(SI)(AX*8), X2
MOVUPS 48(SI)(AX*8), X3
DIVPD (DI)(AX*8), X0
DIVPD 16(DI)(AX*8), X1
DIVPD 32(DI)(AX*8), X2
DIVPD 48(DI)(AX*8), X3
MOVUPS X0, (DI)(AX*8)
div_loop: // Loop unrolled 8x do {
MOVUPS (DI)(AX*8), X0 // X0 := dst[i:i+1]
MOVUPS 16(DI)(AX*8), X1
MOVUPS 32(DI)(AX*8), X2
MOVUPS 48(DI)(AX*8), X3
DIVPD (SI)(AX*8), X0 // X0 /= s[i:i+1]
DIVPD 16(SI)(AX*8), X1
DIVPD 32(SI)(AX*8), X2
DIVPD 48(SI)(AX*8), X3
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
MOVUPS X1, 16(DI)(AX*8)
MOVUPS X2, 32(DI)(AX*8)
MOVUPS X3, 48(DI)(AX*8)
ADDQ $4, AX
LOOP div_loop
CMPQ BX, $0
ADDQ $8, AX // i += 8
LOOP div_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE div_end
div_tail_start:
MOVQ BX, CX
div_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
div_tail:
MOVSD (DI)(AX*8), X0
DIVSD (SI)(AX*8), X0
MOVSD X0, (DI)(AX*8)
INCQ AX
LOOP div_tail
div_tail: // do {
MOVSD (DI)(AX*8), X0 // X0 = dst[i]
DIVSD (SI)(AX*8), X0 // X0 /= s[i]
MOVSD X0, (DI)(AX*8) // dst[i] = X0
INCQ AX // ++i
LOOP div_tail // } while --CX > 0
div_end:
RET

View File

@@ -8,66 +8,66 @@
// func DivTo(dst, x, y []float64)
TEXT ·DivTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), CX
MOVQ x_base+24(FP), SI
MOVQ y_base+48(FP), DX
CMPQ x_len+32(FP), CX
MOVQ dst_base+0(FP), DI // DI := &dst
MOVQ dst_len+8(FP), CX // CX := len(dst)
MOVQ x_base+24(FP), SI // SI := &x
MOVQ y_base+48(FP), DX // DX := &y
CMPQ x_len+32(FP), CX // CX = min( len(dst), len(x), len(y) )
CMOVQLE x_len+32(FP), CX
CMPQ y_len+56(FP), CX
CMOVQLE y_len+56(FP), CX
MOVQ CX, ret_len+80(FP)
CMPQ CX, $0
MOVQ CX, ret_len+80(FP) // len(ret) = CX
CMPQ CX, $0 // if CX == 0 { return }
JE div_end
XORQ AX, AX
XORQ AX, AX // i := 0
MOVQ DI, BX
ANDQ $15, BX
JZ div_no_trim
ANDQ $15, BX // BX := &dst & 0xF
JZ div_no_trim // if BX == 0 { goto div_no_trim }
// Align on 16-byte boundary
MOVSD (SI)(AX*8), X0
DIVSD (DX)(AX*8), X0
MOVSD X0, (DI)(AX*8)
INCQ AX
DECQ CX
JZ div_end // */
MOVSD (SI)(AX*8), X0 // X0 := x[i]
DIVSD (DX)(AX*8), X0 // X0 /= y[i]
MOVSD X0, (DI)(AX*8) // dst[i] = X0
INCQ AX // ++i
DECQ CX // --CX
JZ div_end // if CX == 0 { return }
div_no_trim:
MOVQ CX, BX
ANDQ $7, BX
SHRQ $3, CX
JZ div_tail_start
ANDQ $7, BX // BX = CX % 8
SHRQ $3, CX // CX = floor( CX / 8 )
JZ div_tail_start // if CX == 0 { goto div_tail_start }
div_loop: // Unroll 8x
MOVUPS (SI)(AX*8), X0
div_loop: // Loop unrolled 8x do {
MOVUPS (SI)(AX*8), X0 // X0 := x[i:i+1]
MOVUPS 16(SI)(AX*8), X1
MOVUPS 32(SI)(AX*8), X2
MOVUPS 48(SI)(AX*8), X3
DIVPD (DX)(AX*8), X0
DIVPD (DX)(AX*8), X0 // X0 /= y[i:i+1]
DIVPD 16(DX)(AX*8), X1
DIVPD 32(DX)(AX*8), X2
DIVPD 48(DX)(AX*8), X3
MOVUPS X0, (DI)(AX*8)
MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X0
MOVUPS X1, 16(DI)(AX*8)
MOVUPS X2, 32(DI)(AX*8)
MOVUPS X3, 48(DI)(AX*8)
ADDQ $8, AX
LOOP div_loop
CMPQ CX, $0
ADDQ $8, AX // i += 8
LOOP div_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE div_end
div_tail_start:
MOVQ BX, CX
div_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
div_tail:
MOVSD (SI)(AX*8), X0
DIVSD (DX)(AX*8), X0
div_tail: // do {
MOVSD (SI)(AX*8), X0 // X0 = x[i]
DIVSD (DX)(AX*8), X0 // X0 /= y[i]
MOVSD X0, (DI)(AX*8)
INCQ AX
LOOP div_tail
INCQ AX // ++i
LOOP div_tail // } while --CX > 0
div_end:
MOVQ DI, ret_base+72(FP)
MOVQ dst_cap+16(FP), DI
MOVQ DI, ret_base+72(FP) // &ret = &dst
MOVQ dst_cap+16(FP), DI // cap(ret) = cap(dst)
MOVQ DI, ret_cap+88(FP)
RET

View File

@@ -8,46 +8,51 @@
// func L1Norm(s, t []float64) float64
TEXT ·L1Norm(SB), NOSPLIT, $0
MOVQ s_base+0(FP), DI
MOVQ t_base+24(FP), SI
MOVQ s_len+8(FP), DX
CMPQ t_len+32(FP), DX
CMOVQLE t_len+32(FP), DX
PXOR X3, X3
XORQ AX, AX
CMPQ DX, $1
JL l1_end
SUBQ $1, DX
JE l1_tail
MOVQ s_base+0(FP), DI // DI := &s
MOVQ t_base+24(FP), SI // SI := &t
MOVQ s_len+8(FP), CX // CX := len(s)
CMPQ t_len+32(FP), CX // CX = min( CX, len(t) )
CMOVQLE t_len+32(FP), CX
PXOR X3, X3 // norm := 0
CMPQ CX, $0 // if CX == 0 { return 0 }
JE l1_end
XORQ AX, AX // i := 0
MOVQ CX, BX
ANDQ $1, BX // BX := CX % 2
SHRQ $1, CX // CX := floor( CX / 2 )
JZ l1_tail_start // if CX == 0 { goto l1_tail_start }
l1_loop:
MOVUPS (SI)(AX*8), X0
MOVUPS (DI)(AX*8), X1
l1_loop: // Loop unrolled 2x do {
MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
MOVAPS X0, X2
SUBPD X1, X0
SUBPD X2, X1
MAXPD X1, X0
ADDPD X0, X3
ADDQ $2, AX
CMPQ AX, DX
JL l1_loop
JG l1_end
MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
ADDPD X0, X3 // norm += X0
ADDQ $2, AX // i += 2
LOOP l1_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE l1_end
l1_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
PXOR X0, X0 // reset X0, X1 to break dependencies
PXOR X1, X1
l1_tail:
PXOR X0, X0
PXOR X1, X1
MOVSD (SI)(AX*8), X0
MOVSD (DI)(AX*8), X1
MOVSD (SI)(AX*8), X0 // X0 = t[i]
MOVSD (DI)(AX*8), X1 // X1 = s[i]
MOVAPD X0, X2
SUBSD X1, X0
SUBSD X2, X1
MAXSD X1, X0
ADDSD X0, X3
MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
ADDSD X0, X3 // norm += X0
l1_end:
MOVAPS X3, X2
SHUFPD $1, X2, X2
ADDSD X3, X2
MOVSD X2, ret+48(FP)
ADDSD X3, X2 // X2 = X3[1] + X3[0]
MOVSD X2, ret+48(FP) // return X2
RET

View File

@@ -8,45 +8,50 @@
// func LinfNorm(s, t []float64) float64
TEXT ·LinfNorm(SB), NOSPLIT, $0
MOVQ s_base+0(FP), DI
MOVQ t_base+24(FP), SI
MOVQ s_len+8(FP), DX
CMPQ t_len+32(FP), DX
CMOVQLE t_len+32(FP), DX
PXOR X3, X3
XORQ AX, AX
CMPQ DX, $1
JL l1_end
SUBQ $1, DX
JE l1_tail
MOVQ s_base+0(FP), DI // DI := &s
MOVQ t_base+24(FP), SI // SI := &t
MOVQ s_len+8(FP), CX // CX := len(s)
CMPQ t_len+32(FP), CX // CX = min( CX, len(t) )
CMOVQLE t_len+32(FP), CX
PXOR X3, X3 // norm := 0
CMPQ CX, $0 // if CX == 0 { return 0 }
JE l1_end
XORQ AX, AX // i := 0
MOVQ CX, BX
ANDQ $1, BX // BX := CX % 2
SHRQ $1, CX // CX := floor( CX / 2 )
JZ l1_tail_start // if CX == 0 { goto l1_tail_start }
l1_loop:
MOVUPS (SI)(AX*8), X0
MOVUPS (DI)(AX*8), X1
l1_loop: // Loop unrolled 2x do {
MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
MOVAPS X0, X2
SUBPD X1, X0
SUBPD X2, X1
MAXPD X1, X0
MAXPD X0, X3
ADDQ $2, AX
CMPQ AX, DX
JL l1_loop
JG l1_end
MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
MAXPD X0, X3 // norm = max( norm, X0 )
ADDQ $2, AX // i += 2
LOOP l1_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE l1_end
l1_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
PXOR X0, X0 // reset X0, X1 to break dependencies
PXOR X1, X1
l1_tail:
PXOR X0, X0
PXOR X1, X1
MOVSD (SI)(AX*8), X0
MOVSD (DI)(AX*8), X1
MOVSD (SI)(AX*8), X0 // X0 = t[i]
MOVSD (DI)(AX*8), X1 // X1 = s[i]
MOVAPD X0, X2
SUBSD X1, X0
SUBSD X2, X1
MAXSD X1, X0
MAXSD X0, X3
MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
MAXSD X0, X3 // norm = max( norm, X0 )
l1_end:
MOVAPS X3, X2
SHUFPD $1, X2, X2
MAXSD X3, X2
MOVSD X2, ret+48(FP)
MAXSD X3, X2 // X2 = max( X3[1], X3[0] )
MOVSD X2, ret+48(FP) // return X2
RET

View File

@@ -447,6 +447,11 @@ func TestDiv(t *testing.T) {
src: []float64{1, 2, 3, 4},
expect: []float64{1, 1, 1, 1},
},
{
dst: []float64{1, 2, 3, 4, 2, 4, 6, 8},
src: []float64{1, 2, 3, 4, 1, 2, 3, 4},
expect: []float64{1, 1, 1, 1, 2, 2, 2, 2},
},
{
dst: []float64{2, 4, 6},
src: []float64{1, 2, 3},