mirror of
https://github.com/gonum/gonum.git
synced 2025-10-17 20:51:06 +08:00
asm/f64 Comment assembly code
This commit is contained in:
@@ -8,10 +8,10 @@
|
||||
|
||||
// func AbsSum(x []float64) float64
|
||||
TEXT ·AbsSum(SB), NOSPLIT, $0
|
||||
MOVQ x_base+0(FP), SI
|
||||
MOVQ x_len+8(FP), CX
|
||||
XORQ AX, AX
|
||||
PXOR X0, X0
|
||||
MOVQ x_base+0(FP), SI // SI := &x
|
||||
MOVQ x_len+8(FP), CX // CX := len(x)
|
||||
XORQ AX, AX // i := 0
|
||||
PXOR X0, X0 // p_sum_i := 0
|
||||
PXOR X1, X1
|
||||
PXOR X2, X2
|
||||
PXOR X3, X3
|
||||
@@ -19,59 +19,64 @@ TEXT ·AbsSum(SB), NOSPLIT, $0
|
||||
PXOR X5, X5
|
||||
PXOR X6, X6
|
||||
PXOR X7, X7
|
||||
CMPQ CX, $0
|
||||
CMPQ CX, $0 // if CX == 0 { return 0 }
|
||||
JE absum_end
|
||||
MOVQ CX, BX
|
||||
ANDQ $7, BX
|
||||
SHRQ $3, CX
|
||||
JZ absum_tail_start
|
||||
ANDQ $7, BX // BX := CX % 8
|
||||
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||
JZ absum_tail_start // if CX == 0 { goto absum_tail_start }
|
||||
|
||||
absum_loop:
|
||||
MOVUPS (SI)(AX*8), X8
|
||||
absum_loop: // do {
|
||||
// p_sum = max( p_sum + x[i], p_sum - x[i] )
|
||||
MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+1]
|
||||
MOVUPS 16(SI)(AX*8), X9
|
||||
MOVUPS 32(SI)(AX*8), X10
|
||||
MOVUPS 48(SI)(AX*8), X11
|
||||
ADDPD X8, X0
|
||||
ADDPD X8, X0 // p_sum_i += X_i ( positive values )
|
||||
ADDPD X9, X2
|
||||
ADDPD X10, X4
|
||||
ADDPD X11, X6
|
||||
SUBPD X8, X1
|
||||
SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
|
||||
SUBPD X9, X3
|
||||
SUBPD X10, X5
|
||||
SUBPD X11, X7
|
||||
MAXPD X1, X0
|
||||
MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
|
||||
MAXPD X3, X2
|
||||
MAXPD X5, X4
|
||||
MAXPD X7, X6
|
||||
MOVAPS X0, X1
|
||||
MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
|
||||
MOVAPS X2, X3
|
||||
MOVAPS X4, X5
|
||||
MOVAPS X6, X7
|
||||
ADDQ $8, AX
|
||||
LOOP absum_loop
|
||||
ADDPD X3, X0
|
||||
ADDPD X5, X7
|
||||
ADDPD X7, X0
|
||||
ADDQ $8, AX // i += 8
|
||||
LOOP absum_loop // } while --CX > 0
|
||||
|
||||
// p_sum_0 = \sum_{i=0}^{3}( p_sum_(i*2) )
|
||||
ADDPD X3, X0
|
||||
ADDPD X5, X7
|
||||
ADDPD X7, X0
|
||||
|
||||
// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
|
||||
MOVAPS X0, X1
|
||||
SHUFPD $0x3, X0, X0
|
||||
SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
|
||||
ADDSD X1, X0
|
||||
MOVSD X0, X1
|
||||
CMPQ BX, $0
|
||||
JE absum_end
|
||||
JE absum_end // if BX == 0 { goto absum_end }
|
||||
|
||||
absum_tail_start:
|
||||
MOVQ BX, CX
|
||||
XORPS X8, X8
|
||||
absum_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
XORPS X8, X8 // X_8 = 0
|
||||
|
||||
absum_tail:
|
||||
MOVSD (SI)(AX*8), X8
|
||||
ADDSD X8, X0
|
||||
SUBSD X8, X1
|
||||
MAXSD X1, X0
|
||||
MOVSD X0, X1
|
||||
INCQ AX
|
||||
LOOP absum_tail
|
||||
absum_tail: // do {
|
||||
// p_sum = max( p_sum + x[i], p_sum - x[i] )
|
||||
MOVSD (SI)(AX*8), X8 // X_8 = x[i]
|
||||
MOVSD X0, X1 // p_sum_1 = p_sum_0
|
||||
ADDSD X8, X0 // p_sum_0 += X_8
|
||||
SUBSD X8, X1 // p_sum_1 -= X_8
|
||||
MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 )
|
||||
INCQ AX // i++
|
||||
LOOP absum_tail // } while --CX > 0
|
||||
|
||||
absum_end:
|
||||
MOVSD X1, sum+24(FP)
|
||||
absum_end: // return p_sum_0
|
||||
MOVSD X0, sum+24(FP)
|
||||
RET
|
||||
|
@@ -8,13 +8,13 @@
|
||||
|
||||
// func AbsSumInc(x []float64, n, incX int) (sum float64)
|
||||
TEXT ·AbsSumInc(SB), NOSPLIT, $0
|
||||
MOVQ x_base+0(FP), SI
|
||||
MOVQ n+24(FP), CX
|
||||
MOVQ incX+32(FP), AX
|
||||
MOVQ x_base+0(FP), SI // SI := &x
|
||||
MOVQ n+24(FP), CX // CX := n
|
||||
MOVQ incX+32(FP), AX // AX := increment * sizeof( float64 )
|
||||
SHLQ $3, AX
|
||||
MOVQ AX, DX
|
||||
MOVQ AX, DX // DX := AX * 3
|
||||
IMULQ $3, DX
|
||||
PXOR X0, X0
|
||||
PXOR X0, X0 // p_sum_i := 0
|
||||
PXOR X1, X1
|
||||
PXOR X2, X2
|
||||
PXOR X3, X3
|
||||
@@ -22,64 +22,69 @@ TEXT ·AbsSumInc(SB), NOSPLIT, $0
|
||||
PXOR X5, X5
|
||||
PXOR X6, X6
|
||||
PXOR X7, X7
|
||||
CMPQ CX, $0
|
||||
CMPQ CX, $0 // if CX == 0 { return 0 }
|
||||
JE absum_end
|
||||
MOVQ CX, BX
|
||||
ANDQ $7, BX
|
||||
SHRQ $3, CX
|
||||
JZ absum_tail_start
|
||||
ANDQ $7, BX // BX := CX % 8
|
||||
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||
JZ absum_tail_start // if CX == 0 { goto absum_tail_start }
|
||||
|
||||
absum_loop:
|
||||
MOVSD (SI), X8
|
||||
absum_loop: // do {
|
||||
// p_sum = max( p_sum + x[i], p_sum - x[i] )
|
||||
MOVSD (SI), X8 // X_i[0] = x[i]
|
||||
MOVSD (SI)(AX*1), X9
|
||||
MOVSD (SI)(AX*2), X10
|
||||
MOVSD (SI)(DX*1), X11
|
||||
LEAQ (SI)(AX*4), SI
|
||||
MOVHPD (SI), X8
|
||||
LEAQ (SI)(AX*4), SI // SI += 4 * incX * sizeof( float64 )
|
||||
MOVHPD (SI), X8 // X_i[1] = x[i+4]
|
||||
MOVHPD (SI)(AX*1), X9
|
||||
MOVHPD (SI)(AX*2), X10
|
||||
MOVHPD (SI)(DX*1), X11
|
||||
ADDPD X8, X0
|
||||
ADDPD X8, X0 // p_sum_i += X_i ( positive values )
|
||||
ADDPD X9, X2
|
||||
ADDPD X10, X4
|
||||
ADDPD X11, X6
|
||||
SUBPD X8, X1
|
||||
SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
|
||||
SUBPD X9, X3
|
||||
SUBPD X10, X5
|
||||
SUBPD X11, X7
|
||||
MAXPD X1, X0
|
||||
MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
|
||||
MAXPD X3, X2
|
||||
MAXPD X5, X4
|
||||
MAXPD X7, X6
|
||||
MOVAPS X0, X1
|
||||
MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
|
||||
MOVAPS X2, X3
|
||||
MOVAPS X4, X5
|
||||
MOVAPS X6, X7
|
||||
LEAQ (SI)(AX*4), SI
|
||||
LOOP absum_loop
|
||||
ADDPD X3, X0
|
||||
ADDPD X5, X7
|
||||
ADDPD X7, X0
|
||||
LEAQ (SI)(AX*4), SI // SI += 4 * incX * sizeof( float64 )
|
||||
LOOP absum_loop // } while --CX > 0
|
||||
|
||||
// p_sum_0 = \sum_{i=0}^{3}( p_sum_(i*2) )
|
||||
ADDPD X3, X0
|
||||
ADDPD X5, X7
|
||||
ADDPD X7, X0
|
||||
|
||||
// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
|
||||
MOVAPS X0, X1
|
||||
SHUFPD $0x3, X0, X0
|
||||
SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
|
||||
ADDSD X1, X0
|
||||
MOVSD X0, X1
|
||||
CMPQ BX, $0
|
||||
JE absum_end
|
||||
JE absum_end // if BX == 0 { goto absum_end }
|
||||
|
||||
absum_tail_start:
|
||||
MOVQ BX, CX
|
||||
XORPS X8, X8
|
||||
absum_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
XORPS X8, X8 // X_8 = 0
|
||||
|
||||
absum_tail:
|
||||
MOVSD (SI), X8
|
||||
ADDSD X8, X0
|
||||
SUBSD X8, X1
|
||||
MAXSD X1, X0
|
||||
MOVSD X0, X1
|
||||
ADDQ AX, SI
|
||||
LOOP absum_tail
|
||||
absum_tail: // do {
|
||||
// p_sum = max( p_sum + x[i], p_sum - x[i] )
|
||||
MOVSD (SI), X8 // X_8 = x[i]
|
||||
MOVSD X0, X1 // p_sum_1 = p_sum_0
|
||||
ADDSD X8, X0 // p_sum_0 += X_8
|
||||
SUBSD X8, X1 // p_sum_1 -= X_8
|
||||
MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 )
|
||||
ADDQ AX, SI // i++
|
||||
LOOP absum_tail // } while --CX > 0
|
||||
|
||||
absum_end:
|
||||
MOVSD X1, sum+40(FP)
|
||||
absum_end: // return p_sum_0
|
||||
MOVSD X0, sum+40(FP)
|
||||
RET
|
||||
|
@@ -8,59 +8,59 @@
|
||||
|
||||
// func Add(dst, s []float64)
|
||||
TEXT ·Add(SB), NOSPLIT, $0
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ dst_len+8(FP), CX
|
||||
MOVQ s_base+24(FP), SI
|
||||
CMPQ s_len+32(FP), CX
|
||||
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||
MOVQ s_base+24(FP), SI // SI := &s
|
||||
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
|
||||
CMOVQLE s_len+32(FP), CX
|
||||
CMPQ CX, $0
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE add_end
|
||||
XORQ AX, AX
|
||||
MOVQ DI, BX
|
||||
ANDQ $0x0F, BX
|
||||
JZ add_no_trim
|
||||
ANDQ $0x0F, BX // BX := &dst & 15
|
||||
JZ add_no_trim // if BX == 0 { goto add_no_trim }
|
||||
|
||||
// Align on 16-byte boundary
|
||||
MOVSD (DI)(AX*8), X0
|
||||
ADDSD (SI)(AX*8), X0
|
||||
MOVSD X0, (DI)(AX*8)
|
||||
INCQ AX
|
||||
DECQ CX
|
||||
JE add_end
|
||||
MOVSD (SI)(AX*8), X0 // X0 = s[i]
|
||||
ADDSD (DI)(AX*8), X0 // X0 += dst[i]
|
||||
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||
INCQ AX // i++
|
||||
DECQ CX // --CX
|
||||
JE add_end // if CX == 0 { return }
|
||||
|
||||
add_no_trim:
|
||||
MOVQ CX, BX
|
||||
ANDQ $7, BX
|
||||
SHRQ $3, CX
|
||||
JZ add_tail_start
|
||||
ANDQ $7, BX // BX := CX % 8
|
||||
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||
JZ add_tail_start // if CX == 0 { goto add_tail_start }
|
||||
|
||||
add_loop: // Loop unrolled 8x
|
||||
MOVUPS (SI)(AX*8), X0
|
||||
add_loop: // Loop unrolled 8x do {
|
||||
MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1]
|
||||
MOVUPS 16(SI)(AX*8), X1
|
||||
MOVUPS 32(SI)(AX*8), X2
|
||||
MOVUPS 48(SI)(AX*8), X3
|
||||
ADDPD (DI)(AX*8), X0
|
||||
ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+1]
|
||||
ADDPD 16(DI)(AX*8), X1
|
||||
ADDPD 32(DI)(AX*8), X2
|
||||
ADDPD 48(DI)(AX*8), X3
|
||||
MOVUPS X0, (DI)(AX*8)
|
||||
MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X_i
|
||||
MOVUPS X1, 16(DI)(AX*8)
|
||||
MOVUPS X2, 32(DI)(AX*8)
|
||||
MOVUPS X3, 48(DI)(AX*8)
|
||||
ADDQ $8, AX
|
||||
LOOP add_loop
|
||||
CMPQ BX, $0
|
||||
ADDQ $8, AX // i += 8
|
||||
LOOP add_loop // } while --CX > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE add_end
|
||||
|
||||
add_tail_start:
|
||||
MOVQ BX, CX
|
||||
add_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
|
||||
add_tail:
|
||||
MOVSD (DI)(AX*8), X0
|
||||
ADDSD (SI)(AX*8), X0
|
||||
MOVSD X0, (DI)(AX*8)
|
||||
INCQ AX
|
||||
LOOP add_tail
|
||||
add_tail: // do {
|
||||
MOVSD (SI)(AX*8), X0 // X0 = s[i]
|
||||
ADDSD (DI)(AX*8), X0 // X0 += dst[i]
|
||||
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||
INCQ AX // ++i
|
||||
LOOP add_tail // } while --CX > 0
|
||||
|
||||
add_end:
|
||||
RET
|
||||
|
@@ -8,46 +8,46 @@
|
||||
|
||||
// func AddConst(alpha float64, x []float64)
|
||||
TEXT ·AddConst(SB), NOSPLIT, $0
|
||||
MOVQ x_base+8(FP), SI
|
||||
MOVQ x_len+16(FP), CX
|
||||
CMPQ CX, $0
|
||||
MOVQ x_base+8(FP), SI // SI := &x
|
||||
MOVQ x_len+16(FP), CX // CX := len(x)
|
||||
CMPQ CX, $0 // if len(x) == 0 { return }
|
||||
JE ac_end
|
||||
MOVSD alpha+0(FP), X4
|
||||
MOVSD alpha+0(FP), X4 // X4 = { a, a }
|
||||
SHUFPD $0, X4, X4
|
||||
MOVUPS X4, X5
|
||||
XORQ AX, AX
|
||||
MOVUPS X4, X5 // X5 = X4
|
||||
XORQ AX, AX // i = 0
|
||||
MOVQ CX, BX
|
||||
ANDQ $7, BX
|
||||
SHRQ $3, CX
|
||||
JZ ac_tail_start
|
||||
ANDQ $7, BX // BX := len(x) % 8
|
||||
SHRQ $3, CX // CX := floor( CX / 8 )
|
||||
JZ ac_tail_start // if CX == 0 { goto ac_tail_start }
|
||||
|
||||
ac_loop:
|
||||
MOVUPS (SI)(AX*8), X0
|
||||
ac_loop: // Loop unrolled 8x do {
|
||||
MOVUPS (SI)(AX*8), X0 // X_i = x[i:i+1]
|
||||
MOVUPS 16(SI)(AX*8), X1
|
||||
MOVUPS 32(SI)(AX*8), X2
|
||||
MOVUPS 48(SI)(AX*8), X3
|
||||
ADDPD X4, X0
|
||||
ADDPD X4, X0 // X_i += a
|
||||
ADDPD X5, X1
|
||||
ADDPD X4, X2
|
||||
ADDPD X5, X3
|
||||
MOVUPS X0, (SI)(AX*8)
|
||||
MOVUPS X0, (SI)(AX*8) // x[i:i+1] = X_i
|
||||
MOVUPS X1, 16(SI)(AX*8)
|
||||
MOVUPS X2, 32(SI)(AX*8)
|
||||
MOVUPS X3, 48(SI)(AX*8)
|
||||
ADDQ $8, AX
|
||||
LOOP ac_loop
|
||||
CMPQ BX, $0
|
||||
ADDQ $8, AX // i += 8
|
||||
LOOP ac_loop // } while --CX > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE ac_end
|
||||
|
||||
ac_tail_start:
|
||||
MOVQ BX, CX
|
||||
ac_tail_start: // Reset loop counters
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
|
||||
ac_tail:
|
||||
MOVSD (SI)(AX*8), X0
|
||||
ADDSD X4, X0
|
||||
MOVSD X0, (SI)(AX*8)
|
||||
INCQ AX
|
||||
LOOP ac_tail
|
||||
ac_tail: // do {
|
||||
MOVSD (SI)(AX*8), X0 // X0 = x[i]
|
||||
ADDSD X4, X0 // X0 += a
|
||||
MOVSD X0, (SI)(AX*8) // x[i] = X0
|
||||
INCQ AX // ++i
|
||||
LOOP ac_tail // } while --CX > 0
|
||||
|
||||
ac_end:
|
||||
RET
|
||||
|
@@ -7,63 +7,65 @@
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·CumProd(SB), NOSPLIT, $0
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ dst_len+8(FP), CX
|
||||
MOVQ s_base+24(FP), SI
|
||||
CMPQ s_len+32(FP), CX
|
||||
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||
MOVQ s_base+24(FP), SI // SI := &s
|
||||
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
|
||||
CMOVQLE s_len+32(FP), CX
|
||||
MOVQ CX, ret_len+56(FP)
|
||||
CMPQ CX, $0
|
||||
JE cs_end
|
||||
XORQ AX, AX
|
||||
MOVQ CX, ret_len+56(FP) // len(ret) = CX
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE cp_end
|
||||
XORQ AX, AX // i := 0
|
||||
|
||||
MOVSD (SI), X5
|
||||
MOVSD (SI), X5 // p_prod = { s[0], s[0] }
|
||||
SHUFPD $0, X5, X5
|
||||
MOVSD X5, (DI)
|
||||
INCQ AX
|
||||
DECQ CX
|
||||
JZ cs_end
|
||||
MOVSD X5, (DI) // dst[0] = s[0]
|
||||
INCQ AX // ++i
|
||||
DECQ CX // -- CX
|
||||
JZ cp_end // if CX == 0 { return }
|
||||
|
||||
MOVQ CX, BX
|
||||
ANDQ $3, BX
|
||||
SHRQ $2, CX
|
||||
JZ cs_tail_start
|
||||
ANDQ $3, BX // BX := CX % 4
|
||||
SHRQ $2, CX // CX = floor( CX / 4 )
|
||||
JZ cp_tail_start // if CX == 0 { goto cp_tail_start }
|
||||
|
||||
cs_loop:
|
||||
MOVUPS (SI)(AX*8), X0
|
||||
cp_loop: // Loop unrolled 4x do {
|
||||
MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
|
||||
MOVUPS 16(SI)(AX*8), X2
|
||||
MOVAPS X0, X1
|
||||
MOVAPS X0, X1 // X1 = X0
|
||||
MOVAPS X2, X3
|
||||
SHUFPD $1, X1, X1
|
||||
SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
|
||||
SHUFPD $1, X3, X3
|
||||
MULPD X0, X1
|
||||
MULPD X0, X1 // X1 *= X0
|
||||
MULPD X2, X3
|
||||
SHUFPD $2, X1, X0
|
||||
SHUFPD $3, X1, X1
|
||||
SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
|
||||
SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
|
||||
SHUFPD $2, X3, X2
|
||||
SHUFPD $3, X3, X3
|
||||
MULPD X5, X0
|
||||
MULPD X1, X5
|
||||
MULPD X5, X0 // X0 *= p_prod
|
||||
MULPD X1, X5 // p_prod *= X1
|
||||
MULPD X5, X2
|
||||
MOVUPS X0, (DI)(AX*8)
|
||||
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
|
||||
MOVUPS X2, 16(DI)(AX*8)
|
||||
MULPD X3, X5
|
||||
ADDQ $4, AX
|
||||
LOOP cs_loop
|
||||
CMPQ BX, $0
|
||||
JE cs_end
|
||||
ADDQ $4, AX // i += 4
|
||||
LOOP cp_loop // } while --CX > 0
|
||||
|
||||
cs_tail_start:
|
||||
MOVQ BX, CX
|
||||
// if BX == 0 { return }
|
||||
CMPQ BX, $0
|
||||
JE cp_end
|
||||
|
||||
cs_tail:
|
||||
MULSD (SI)(AX*8), X5
|
||||
MOVSD X5, (DI)(AX*8)
|
||||
INCQ AX
|
||||
LOOP cs_tail
|
||||
cp_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
|
||||
cs_end:
|
||||
MOVQ DI, ret_base+48(FP)
|
||||
MOVQ dst_cap+16(FP), SI
|
||||
cp_tail: // do {
|
||||
MULSD (SI)(AX*8), X5 // p_prod *= s[i]
|
||||
MOVSD X5, (DI)(AX*8) // dst[i] = p_prod
|
||||
INCQ AX // ++i
|
||||
LOOP cp_tail // } while --CX > 0
|
||||
|
||||
cp_end:
|
||||
MOVQ DI, ret_base+48(FP) // &ret = &dst
|
||||
MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst)
|
||||
MOVQ SI, ret_cap+64(FP)
|
||||
RET
|
||||
|
@@ -7,56 +7,58 @@
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·CumSum(SB), NOSPLIT, $0
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ dst_len+8(FP), CX
|
||||
MOVQ s_base+24(FP), SI
|
||||
CMPQ s_len+32(FP), CX
|
||||
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||
MOVQ s_base+24(FP), SI // SI := &s
|
||||
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
|
||||
CMOVQLE s_len+32(FP), CX
|
||||
MOVQ CX, ret_len+56(FP)
|
||||
CMPQ CX, $0
|
||||
MOVQ CX, ret_len+56(FP) // len(ret) = CX
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE cs_end
|
||||
XORQ AX, AX
|
||||
PXOR X5, X5
|
||||
XORQ AX, AX // i := 0
|
||||
PXOR X5, X5 // p_sum = 0
|
||||
MOVQ CX, BX
|
||||
ANDQ $3, BX
|
||||
SHRQ $2, CX
|
||||
JZ cs_tail_start
|
||||
ANDQ $3, BX // BX := CX % 4
|
||||
SHRQ $2, CX // CX = floor( CX / 4 )
|
||||
JZ cs_tail_start // if CX == 0 { goto cs_tail_start }
|
||||
|
||||
cs_loop:
|
||||
MOVUPS (SI)(AX*8), X0
|
||||
cs_loop: // Loop unrolled 4x do {
|
||||
MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
|
||||
MOVUPS 16(SI)(AX*8), X2
|
||||
MOVAPS X0, X1
|
||||
MOVAPS X0, X1 // X1 = X0
|
||||
MOVAPS X2, X3
|
||||
SHUFPD $1, X1, X1
|
||||
SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
|
||||
SHUFPD $1, X3, X3
|
||||
ADDPD X0, X1
|
||||
ADDPD X0, X1 // X1 += X0
|
||||
ADDPD X2, X3
|
||||
SHUFPD $2, X1, X0
|
||||
SHUFPD $3, X1, X1
|
||||
SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
|
||||
SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
|
||||
SHUFPD $2, X3, X2
|
||||
SHUFPD $3, X3, X3
|
||||
ADDPD X5, X0
|
||||
ADDPD X1, X5
|
||||
ADDPD X5, X0 // X0 += p_sum
|
||||
ADDPD X1, X5 // p_sum += X1
|
||||
ADDPD X5, X2
|
||||
MOVUPS X0, (DI)(AX*8)
|
||||
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
|
||||
MOVUPS X2, 16(DI)(AX*8)
|
||||
ADDPD X3, X5
|
||||
ADDQ $4, AX
|
||||
LOOP cs_loop
|
||||
CMPQ BX, $0
|
||||
JE cs_end
|
||||
ADDQ $4, AX // i += 4
|
||||
LOOP cs_loop // } while --CX > 0
|
||||
|
||||
cs_tail_start:
|
||||
MOVQ BX, CX
|
||||
// if BX == 0 { return }
|
||||
CMPQ BX, $0
|
||||
JE cs_end
|
||||
|
||||
cs_tail:
|
||||
ADDSD (SI)(AX*8), X5
|
||||
MOVSD X5, (DI)(AX*8)
|
||||
INCQ AX
|
||||
LOOP cs_tail
|
||||
cs_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
|
||||
cs_tail: // do {
|
||||
ADDSD (SI)(AX*8), X5 // p_sum += s[i]
|
||||
MOVSD X5, (DI)(AX*8) // dst[i] = p_sum
|
||||
INCQ AX // ++i
|
||||
LOOP cs_tail // } while --CX > 0
|
||||
|
||||
cs_end:
|
||||
MOVQ DI, ret_base+48(FP)
|
||||
MOVQ dst_cap+16(FP), SI
|
||||
MOVQ DI, ret_base+48(FP) // &ret = &dst
|
||||
MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst)
|
||||
MOVQ SI, ret_cap+64(FP)
|
||||
RET
|
||||
|
@@ -8,59 +8,59 @@
|
||||
|
||||
// func Div(dst, s []float64)
|
||||
TEXT ·Div(SB), NOSPLIT, $0
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ dst_len+8(FP), CX
|
||||
MOVQ s_base+24(FP), SI
|
||||
CMPQ s_len+32(FP), CX
|
||||
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||
MOVQ s_base+24(FP), SI // SI := &s
|
||||
CMPQ s_len+32(FP), CX // CX = min( CX, len(s) )
|
||||
CMOVQLE s_len+32(FP), CX
|
||||
CMPQ CX, $0
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE div_end
|
||||
XORQ AX, AX
|
||||
XORQ AX, AX // i := 0
|
||||
MOVQ SI, BX
|
||||
ANDQ $15, BX
|
||||
JZ div_no_trim
|
||||
ANDQ $15, BX // BX := &s & 15
|
||||
JZ div_no_trim // if BX == 0 { goto div_no_trim }
|
||||
|
||||
// Align on 16-byte boundary
|
||||
MOVSD (DI)(AX*8), X0
|
||||
DIVSD (SI)(AX*8), X0
|
||||
MOVSD X0, (DI)(AX*8)
|
||||
INCQ AX
|
||||
DECQ CX
|
||||
JZ div_end
|
||||
MOVSD (DI)(AX*8), X0 // X0 := dst[i]
|
||||
DIVSD (SI)(AX*8), X0 // X0 /= s[i]
|
||||
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||
INCQ AX // ++i
|
||||
DECQ CX // --CX
|
||||
JZ div_end // if CX == 0 { return }
|
||||
|
||||
div_no_trim:
|
||||
MOVQ CX, BX
|
||||
ANDQ $7, BX
|
||||
SHRQ $3, CX
|
||||
JZ div_tail_start
|
||||
ANDQ $7, BX // BX = CX % 8
|
||||
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||
JZ div_tail_start // if CX == 0 { goto div_tail_start }
|
||||
|
||||
div_loop: // Loop unrolled 8x
|
||||
MOVUPS (SI)(AX*8), X0
|
||||
MOVUPS 16(SI)(AX*8), X1
|
||||
MOVUPS 32(SI)(AX*8), X2
|
||||
MOVUPS 48(SI)(AX*8), X3
|
||||
DIVPD (DI)(AX*8), X0
|
||||
DIVPD 16(DI)(AX*8), X1
|
||||
DIVPD 32(DI)(AX*8), X2
|
||||
DIVPD 48(DI)(AX*8), X3
|
||||
MOVUPS X0, (DI)(AX*8)
|
||||
div_loop: // Loop unrolled 8x do {
|
||||
MOVUPS (DI)(AX*8), X0 // X0 := dst[i:i+1]
|
||||
MOVUPS 16(DI)(AX*8), X1
|
||||
MOVUPS 32(DI)(AX*8), X2
|
||||
MOVUPS 48(DI)(AX*8), X3
|
||||
DIVPD (SI)(AX*8), X0 // X0 /= s[i:i+1]
|
||||
DIVPD 16(SI)(AX*8), X1
|
||||
DIVPD 32(SI)(AX*8), X2
|
||||
DIVPD 48(SI)(AX*8), X3
|
||||
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
|
||||
MOVUPS X1, 16(DI)(AX*8)
|
||||
MOVUPS X2, 32(DI)(AX*8)
|
||||
MOVUPS X3, 48(DI)(AX*8)
|
||||
ADDQ $4, AX
|
||||
LOOP div_loop
|
||||
CMPQ BX, $0
|
||||
ADDQ $8, AX // i += 8
|
||||
LOOP div_loop // } while --CX > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE div_end
|
||||
|
||||
div_tail_start:
|
||||
MOVQ BX, CX
|
||||
div_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
|
||||
div_tail:
|
||||
MOVSD (DI)(AX*8), X0
|
||||
DIVSD (SI)(AX*8), X0
|
||||
MOVSD X0, (DI)(AX*8)
|
||||
INCQ AX
|
||||
LOOP div_tail
|
||||
div_tail: // do {
|
||||
MOVSD (DI)(AX*8), X0 // X0 = dst[i]
|
||||
DIVSD (SI)(AX*8), X0 // X0 /= s[i]
|
||||
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||
INCQ AX // ++i
|
||||
LOOP div_tail // } while --CX > 0
|
||||
|
||||
div_end:
|
||||
RET
|
||||
|
@@ -8,66 +8,66 @@
|
||||
|
||||
// func DivTo(dst, x, y []float64)
|
||||
TEXT ·DivTo(SB), NOSPLIT, $0
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ dst_len+8(FP), CX
|
||||
MOVQ x_base+24(FP), SI
|
||||
MOVQ y_base+48(FP), DX
|
||||
CMPQ x_len+32(FP), CX
|
||||
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||
MOVQ x_base+24(FP), SI // SI := &x
|
||||
MOVQ y_base+48(FP), DX // DX := &y
|
||||
CMPQ x_len+32(FP), CX // CX = min( len(dst), len(x), len(y) )
|
||||
CMOVQLE x_len+32(FP), CX
|
||||
CMPQ y_len+56(FP), CX
|
||||
CMOVQLE y_len+56(FP), CX
|
||||
MOVQ CX, ret_len+80(FP)
|
||||
CMPQ CX, $0
|
||||
MOVQ CX, ret_len+80(FP) // len(ret) = CX
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE div_end
|
||||
XORQ AX, AX
|
||||
XORQ AX, AX // i := 0
|
||||
MOVQ DI, BX
|
||||
ANDQ $15, BX
|
||||
JZ div_no_trim
|
||||
ANDQ $15, BX // BX := &dst & 0xF
|
||||
JZ div_no_trim // if BX == 0 { goto div_no_trim }
|
||||
|
||||
// Align on 16-byte boundary
|
||||
MOVSD (SI)(AX*8), X0
|
||||
DIVSD (DX)(AX*8), X0
|
||||
MOVSD X0, (DI)(AX*8)
|
||||
INCQ AX
|
||||
DECQ CX
|
||||
JZ div_end // */
|
||||
MOVSD (SI)(AX*8), X0 // X0 := x[i]
|
||||
DIVSD (DX)(AX*8), X0 // X0 /= y[i]
|
||||
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||
INCQ AX // ++i
|
||||
DECQ CX // --CX
|
||||
JZ div_end // if CX == 0 { return }
|
||||
|
||||
div_no_trim:
|
||||
MOVQ CX, BX
|
||||
ANDQ $7, BX
|
||||
SHRQ $3, CX
|
||||
JZ div_tail_start
|
||||
ANDQ $7, BX // BX = CX % 8
|
||||
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||
JZ div_tail_start // if CX == 0 { goto div_tail_start }
|
||||
|
||||
div_loop: // Unroll 8x
|
||||
MOVUPS (SI)(AX*8), X0
|
||||
div_loop: // Loop unrolled 8x do {
|
||||
MOVUPS (SI)(AX*8), X0 // X0 := x[i:i+1]
|
||||
MOVUPS 16(SI)(AX*8), X1
|
||||
MOVUPS 32(SI)(AX*8), X2
|
||||
MOVUPS 48(SI)(AX*8), X3
|
||||
DIVPD (DX)(AX*8), X0
|
||||
DIVPD (DX)(AX*8), X0 // X0 /= y[i:i+1]
|
||||
DIVPD 16(DX)(AX*8), X1
|
||||
DIVPD 32(DX)(AX*8), X2
|
||||
DIVPD 48(DX)(AX*8), X3
|
||||
MOVUPS X0, (DI)(AX*8)
|
||||
MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X0
|
||||
MOVUPS X1, 16(DI)(AX*8)
|
||||
MOVUPS X2, 32(DI)(AX*8)
|
||||
MOVUPS X3, 48(DI)(AX*8)
|
||||
ADDQ $8, AX
|
||||
LOOP div_loop
|
||||
CMPQ CX, $0
|
||||
ADDQ $8, AX // i += 8
|
||||
LOOP div_loop // } while --CX > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE div_end
|
||||
|
||||
div_tail_start:
|
||||
MOVQ BX, CX
|
||||
div_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
|
||||
div_tail:
|
||||
MOVSD (SI)(AX*8), X0
|
||||
DIVSD (DX)(AX*8), X0
|
||||
div_tail: // do {
|
||||
MOVSD (SI)(AX*8), X0 // X0 = x[i]
|
||||
DIVSD (DX)(AX*8), X0 // X0 /= y[i]
|
||||
MOVSD X0, (DI)(AX*8)
|
||||
INCQ AX
|
||||
LOOP div_tail
|
||||
INCQ AX // ++i
|
||||
LOOP div_tail // } while --CX > 0
|
||||
|
||||
div_end:
|
||||
MOVQ DI, ret_base+72(FP)
|
||||
MOVQ dst_cap+16(FP), DI
|
||||
MOVQ DI, ret_base+72(FP) // &ret = &dst
|
||||
MOVQ dst_cap+16(FP), DI // cap(ret) = cap(dst)
|
||||
MOVQ DI, ret_cap+88(FP)
|
||||
RET
|
||||
|
@@ -8,46 +8,51 @@
|
||||
|
||||
// func L1Norm(s, t []float64) float64
|
||||
TEXT ·L1Norm(SB), NOSPLIT, $0
|
||||
MOVQ s_base+0(FP), DI
|
||||
MOVQ t_base+24(FP), SI
|
||||
MOVQ s_len+8(FP), DX
|
||||
CMPQ t_len+32(FP), DX
|
||||
CMOVQLE t_len+32(FP), DX
|
||||
PXOR X3, X3
|
||||
XORQ AX, AX
|
||||
CMPQ DX, $1
|
||||
JL l1_end
|
||||
SUBQ $1, DX
|
||||
JE l1_tail
|
||||
MOVQ s_base+0(FP), DI // DI := &s
|
||||
MOVQ t_base+24(FP), SI // SI := &t
|
||||
MOVQ s_len+8(FP), CX // CX := len(s)
|
||||
CMPQ t_len+32(FP), CX // CX = min( CX, len(t) )
|
||||
CMOVQLE t_len+32(FP), CX
|
||||
PXOR X3, X3 // norm := 0
|
||||
CMPQ CX, $0 // if CX == 0 { return 0 }
|
||||
JE l1_end
|
||||
XORQ AX, AX // i := 0
|
||||
MOVQ CX, BX
|
||||
ANDQ $1, BX // BX := CX % 2
|
||||
SHRQ $1, CX // CX := floor( CX / 2 )
|
||||
JZ l1_tail_start // if CX == 0 { goto l1_tail_start }
|
||||
|
||||
l1_loop:
|
||||
MOVUPS (SI)(AX*8), X0
|
||||
MOVUPS (DI)(AX*8), X1
|
||||
l1_loop: // Loop unrolled 2x do {
|
||||
MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
|
||||
MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
|
||||
MOVAPS X0, X2
|
||||
SUBPD X1, X0
|
||||
SUBPD X2, X1
|
||||
MAXPD X1, X0
|
||||
ADDPD X0, X3
|
||||
ADDQ $2, AX
|
||||
CMPQ AX, DX
|
||||
JL l1_loop
|
||||
JG l1_end
|
||||
MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
|
||||
ADDPD X0, X3 // norm += X0
|
||||
ADDQ $2, AX // i += 2
|
||||
LOOP l1_loop // } while --CX > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE l1_end
|
||||
|
||||
l1_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
PXOR X0, X0 // reset X0, X1 to break dependencies
|
||||
PXOR X1, X1
|
||||
|
||||
l1_tail:
|
||||
PXOR X0, X0
|
||||
PXOR X1, X1
|
||||
MOVSD (SI)(AX*8), X0
|
||||
MOVSD (DI)(AX*8), X1
|
||||
MOVSD (SI)(AX*8), X0 // X0 = t[i]
|
||||
MOVSD (DI)(AX*8), X1 // X1 = s[i]
|
||||
MOVAPD X0, X2
|
||||
SUBSD X1, X0
|
||||
SUBSD X2, X1
|
||||
MAXSD X1, X0
|
||||
ADDSD X0, X3
|
||||
MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
|
||||
ADDSD X0, X3 // norm += X0
|
||||
|
||||
l1_end:
|
||||
MOVAPS X3, X2
|
||||
SHUFPD $1, X2, X2
|
||||
ADDSD X3, X2
|
||||
MOVSD X2, ret+48(FP)
|
||||
ADDSD X3, X2 // X2 = X3[1] + X3[0]
|
||||
MOVSD X2, ret+48(FP) // return X2
|
||||
RET
|
||||
|
||||
|
@@ -8,45 +8,50 @@
|
||||
|
||||
// func LinfNorm(s, t []float64) float64
|
||||
TEXT ·LinfNorm(SB), NOSPLIT, $0
|
||||
MOVQ s_base+0(FP), DI
|
||||
MOVQ t_base+24(FP), SI
|
||||
MOVQ s_len+8(FP), DX
|
||||
CMPQ t_len+32(FP), DX
|
||||
CMOVQLE t_len+32(FP), DX
|
||||
PXOR X3, X3
|
||||
XORQ AX, AX
|
||||
CMPQ DX, $1
|
||||
JL l1_end
|
||||
SUBQ $1, DX
|
||||
JE l1_tail
|
||||
MOVQ s_base+0(FP), DI // DI := &s
|
||||
MOVQ t_base+24(FP), SI // SI := &t
|
||||
MOVQ s_len+8(FP), CX // CX := len(s)
|
||||
CMPQ t_len+32(FP), CX // CX = min( CX, len(t) )
|
||||
CMOVQLE t_len+32(FP), CX
|
||||
PXOR X3, X3 // norm := 0
|
||||
CMPQ CX, $0 // if CX == 0 { return 0 }
|
||||
JE l1_end
|
||||
XORQ AX, AX // i := 0
|
||||
MOVQ CX, BX
|
||||
ANDQ $1, BX // BX := CX % 2
|
||||
SHRQ $1, CX // CX := floor( CX / 2 )
|
||||
JZ l1_tail_start // if CX == 0 { goto l1_tail_start }
|
||||
|
||||
l1_loop:
|
||||
MOVUPS (SI)(AX*8), X0
|
||||
MOVUPS (DI)(AX*8), X1
|
||||
l1_loop: // Loop unrolled 2x do {
|
||||
MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
|
||||
MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
|
||||
MOVAPS X0, X2
|
||||
SUBPD X1, X0
|
||||
SUBPD X2, X1
|
||||
MAXPD X1, X0
|
||||
MAXPD X0, X3
|
||||
ADDQ $2, AX
|
||||
CMPQ AX, DX
|
||||
JL l1_loop
|
||||
JG l1_end
|
||||
MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
|
||||
MAXPD X0, X3 // norm = max( norm, X0 )
|
||||
ADDQ $2, AX // i += 2
|
||||
LOOP l1_loop // } while --CX > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE l1_end
|
||||
|
||||
l1_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
PXOR X0, X0 // reset X0, X1 to break dependencies
|
||||
PXOR X1, X1
|
||||
|
||||
l1_tail:
|
||||
PXOR X0, X0
|
||||
PXOR X1, X1
|
||||
MOVSD (SI)(AX*8), X0
|
||||
MOVSD (DI)(AX*8), X1
|
||||
MOVSD (SI)(AX*8), X0 // X0 = t[i]
|
||||
MOVSD (DI)(AX*8), X1 // X1 = s[i]
|
||||
MOVAPD X0, X2
|
||||
SUBSD X1, X0
|
||||
SUBSD X2, X1
|
||||
MAXSD X1, X0
|
||||
MAXSD X0, X3
|
||||
MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
|
||||
MAXSD X0, X3 // norm = max( norm, X0 )
|
||||
|
||||
l1_end:
|
||||
MOVAPS X3, X2
|
||||
SHUFPD $1, X2, X2
|
||||
MAXSD X3, X2
|
||||
MOVSD X2, ret+48(FP)
|
||||
MAXSD X3, X2 // X2 = max( X3[1], X3[0] )
|
||||
MOVSD X2, ret+48(FP) // return X2
|
||||
RET
|
||||
|
@@ -447,6 +447,11 @@ func TestDiv(t *testing.T) {
|
||||
src: []float64{1, 2, 3, 4},
|
||||
expect: []float64{1, 1, 1, 1},
|
||||
},
|
||||
{
|
||||
dst: []float64{1, 2, 3, 4, 2, 4, 6, 8},
|
||||
src: []float64{1, 2, 3, 4, 1, 2, 3, 4},
|
||||
expect: []float64{1, 1, 1, 1, 2, 2, 2, 2},
|
||||
},
|
||||
{
|
||||
dst: []float64{2, 4, 6},
|
||||
src: []float64{1, 2, 3},
|
||||
|
Reference in New Issue
Block a user