mirror of
https://github.com/gonum/gonum.git
synced 2025-10-19 13:35:51 +08:00
asm/f64 Comment assembly code
This commit is contained in:
@@ -8,10 +8,10 @@
|
|||||||
|
|
||||||
// func AbsSum(x []float64) float64
|
// func AbsSum(x []float64) float64
|
||||||
TEXT ·AbsSum(SB), NOSPLIT, $0
|
TEXT ·AbsSum(SB), NOSPLIT, $0
|
||||||
MOVQ x_base+0(FP), SI
|
MOVQ x_base+0(FP), SI // SI := &x
|
||||||
MOVQ x_len+8(FP), CX
|
MOVQ x_len+8(FP), CX // CX := len(x)
|
||||||
XORQ AX, AX
|
XORQ AX, AX // i := 0
|
||||||
PXOR X0, X0
|
PXOR X0, X0 // p_sum_i := 0
|
||||||
PXOR X1, X1
|
PXOR X1, X1
|
||||||
PXOR X2, X2
|
PXOR X2, X2
|
||||||
PXOR X3, X3
|
PXOR X3, X3
|
||||||
@@ -19,59 +19,64 @@ TEXT ·AbsSum(SB), NOSPLIT, $0
|
|||||||
PXOR X5, X5
|
PXOR X5, X5
|
||||||
PXOR X6, X6
|
PXOR X6, X6
|
||||||
PXOR X7, X7
|
PXOR X7, X7
|
||||||
CMPQ CX, $0
|
CMPQ CX, $0 // if CX == 0 { return 0 }
|
||||||
JE absum_end
|
JE absum_end
|
||||||
MOVQ CX, BX
|
MOVQ CX, BX
|
||||||
ANDQ $7, BX
|
ANDQ $7, BX // BX := CX % 8
|
||||||
SHRQ $3, CX
|
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||||
JZ absum_tail_start
|
JZ absum_tail_start // if CX == 0 { goto absum_tail_start }
|
||||||
|
|
||||||
absum_loop:
|
absum_loop: // do {
|
||||||
MOVUPS (SI)(AX*8), X8
|
// p_sum = max( p_sum + x[i], p_sum - x[i] )
|
||||||
|
MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+1]
|
||||||
MOVUPS 16(SI)(AX*8), X9
|
MOVUPS 16(SI)(AX*8), X9
|
||||||
MOVUPS 32(SI)(AX*8), X10
|
MOVUPS 32(SI)(AX*8), X10
|
||||||
MOVUPS 48(SI)(AX*8), X11
|
MOVUPS 48(SI)(AX*8), X11
|
||||||
ADDPD X8, X0
|
ADDPD X8, X0 // p_sum_i += X_i ( positive values )
|
||||||
ADDPD X9, X2
|
ADDPD X9, X2
|
||||||
ADDPD X10, X4
|
ADDPD X10, X4
|
||||||
ADDPD X11, X6
|
ADDPD X11, X6
|
||||||
SUBPD X8, X1
|
SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
|
||||||
SUBPD X9, X3
|
SUBPD X9, X3
|
||||||
SUBPD X10, X5
|
SUBPD X10, X5
|
||||||
SUBPD X11, X7
|
SUBPD X11, X7
|
||||||
MAXPD X1, X0
|
MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
|
||||||
MAXPD X3, X2
|
MAXPD X3, X2
|
||||||
MAXPD X5, X4
|
MAXPD X5, X4
|
||||||
MAXPD X7, X6
|
MAXPD X7, X6
|
||||||
MOVAPS X0, X1
|
MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
|
||||||
MOVAPS X2, X3
|
MOVAPS X2, X3
|
||||||
MOVAPS X4, X5
|
MOVAPS X4, X5
|
||||||
MOVAPS X6, X7
|
MOVAPS X6, X7
|
||||||
ADDQ $8, AX
|
ADDQ $8, AX // i += 8
|
||||||
LOOP absum_loop
|
LOOP absum_loop // } while --CX > 0
|
||||||
ADDPD X3, X0
|
|
||||||
ADDPD X5, X7
|
// p_sum_0 = \sum_{i=0}^{3}( p_sum_(i*2) )
|
||||||
ADDPD X7, X0
|
ADDPD X3, X0
|
||||||
|
ADDPD X5, X7
|
||||||
|
ADDPD X7, X0
|
||||||
|
|
||||||
|
// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
|
||||||
MOVAPS X0, X1
|
MOVAPS X0, X1
|
||||||
SHUFPD $0x3, X0, X0
|
SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
|
||||||
ADDSD X1, X0
|
ADDSD X1, X0
|
||||||
MOVSD X0, X1
|
|
||||||
CMPQ BX, $0
|
CMPQ BX, $0
|
||||||
JE absum_end
|
JE absum_end // if BX == 0 { goto absum_end }
|
||||||
|
|
||||||
absum_tail_start:
|
absum_tail_start: // Reset loop registers
|
||||||
MOVQ BX, CX
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
XORPS X8, X8
|
XORPS X8, X8 // X_8 = 0
|
||||||
|
|
||||||
absum_tail:
|
absum_tail: // do {
|
||||||
MOVSD (SI)(AX*8), X8
|
// p_sum = max( p_sum + x[i], p_sum - x[i] )
|
||||||
ADDSD X8, X0
|
MOVSD (SI)(AX*8), X8 // X_8 = x[i]
|
||||||
SUBSD X8, X1
|
MOVSD X0, X1 // p_sum_1 = p_sum_0
|
||||||
MAXSD X1, X0
|
ADDSD X8, X0 // p_sum_0 += X_8
|
||||||
MOVSD X0, X1
|
SUBSD X8, X1 // p_sum_1 -= X_8
|
||||||
INCQ AX
|
MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 )
|
||||||
LOOP absum_tail
|
INCQ AX // i++
|
||||||
|
LOOP absum_tail // } while --CX > 0
|
||||||
|
|
||||||
absum_end:
|
absum_end: // return p_sum_0
|
||||||
MOVSD X1, sum+24(FP)
|
MOVSD X0, sum+24(FP)
|
||||||
RET
|
RET
|
||||||
|
@@ -8,13 +8,13 @@
|
|||||||
|
|
||||||
// func AbsSumInc(x []float64, n, incX int) (sum float64)
|
// func AbsSumInc(x []float64, n, incX int) (sum float64)
|
||||||
TEXT ·AbsSumInc(SB), NOSPLIT, $0
|
TEXT ·AbsSumInc(SB), NOSPLIT, $0
|
||||||
MOVQ x_base+0(FP), SI
|
MOVQ x_base+0(FP), SI // SI := &x
|
||||||
MOVQ n+24(FP), CX
|
MOVQ n+24(FP), CX // CX := n
|
||||||
MOVQ incX+32(FP), AX
|
MOVQ incX+32(FP), AX // AX := increment * sizeof( float64 )
|
||||||
SHLQ $3, AX
|
SHLQ $3, AX
|
||||||
MOVQ AX, DX
|
MOVQ AX, DX // DX := AX * 3
|
||||||
IMULQ $3, DX
|
IMULQ $3, DX
|
||||||
PXOR X0, X0
|
PXOR X0, X0 // p_sum_i := 0
|
||||||
PXOR X1, X1
|
PXOR X1, X1
|
||||||
PXOR X2, X2
|
PXOR X2, X2
|
||||||
PXOR X3, X3
|
PXOR X3, X3
|
||||||
@@ -22,64 +22,69 @@ TEXT ·AbsSumInc(SB), NOSPLIT, $0
|
|||||||
PXOR X5, X5
|
PXOR X5, X5
|
||||||
PXOR X6, X6
|
PXOR X6, X6
|
||||||
PXOR X7, X7
|
PXOR X7, X7
|
||||||
CMPQ CX, $0
|
CMPQ CX, $0 // if CX == 0 { return 0 }
|
||||||
JE absum_end
|
JE absum_end
|
||||||
MOVQ CX, BX
|
MOVQ CX, BX
|
||||||
ANDQ $7, BX
|
ANDQ $7, BX // BX := CX % 8
|
||||||
SHRQ $3, CX
|
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||||
JZ absum_tail_start
|
JZ absum_tail_start // if CX == 0 { goto absum_tail_start }
|
||||||
|
|
||||||
absum_loop:
|
absum_loop: // do {
|
||||||
MOVSD (SI), X8
|
// p_sum = max( p_sum + x[i], p_sum - x[i] )
|
||||||
|
MOVSD (SI), X8 // X_i[0] = x[i]
|
||||||
MOVSD (SI)(AX*1), X9
|
MOVSD (SI)(AX*1), X9
|
||||||
MOVSD (SI)(AX*2), X10
|
MOVSD (SI)(AX*2), X10
|
||||||
MOVSD (SI)(DX*1), X11
|
MOVSD (SI)(DX*1), X11
|
||||||
LEAQ (SI)(AX*4), SI
|
LEAQ (SI)(AX*4), SI // SI = &(SI[incX*4])
|
||||||
MOVHPD (SI), X8
|
MOVHPD (SI), X8 // X_i[1] = x[i+4]
|
||||||
MOVHPD (SI)(AX*1), X9
|
MOVHPD (SI)(AX*1), X9
|
||||||
MOVHPD (SI)(AX*2), X10
|
MOVHPD (SI)(AX*2), X10
|
||||||
MOVHPD (SI)(DX*1), X11
|
MOVHPD (SI)(DX*1), X11
|
||||||
ADDPD X8, X0
|
ADDPD X8, X0 // p_sum_i += X_i ( positive values )
|
||||||
ADDPD X9, X2
|
ADDPD X9, X2
|
||||||
ADDPD X10, X4
|
ADDPD X10, X4
|
||||||
ADDPD X11, X6
|
ADDPD X11, X6
|
||||||
SUBPD X8, X1
|
SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
|
||||||
SUBPD X9, X3
|
SUBPD X9, X3
|
||||||
SUBPD X10, X5
|
SUBPD X10, X5
|
||||||
SUBPD X11, X7
|
SUBPD X11, X7
|
||||||
MAXPD X1, X0
|
MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
|
||||||
MAXPD X3, X2
|
MAXPD X3, X2
|
||||||
MAXPD X5, X4
|
MAXPD X5, X4
|
||||||
MAXPD X7, X6
|
MAXPD X7, X6
|
||||||
MOVAPS X0, X1
|
MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
|
||||||
MOVAPS X2, X3
|
MOVAPS X2, X3
|
||||||
MOVAPS X4, X5
|
MOVAPS X4, X5
|
||||||
MOVAPS X6, X7
|
MOVAPS X6, X7
|
||||||
LEAQ (SI)(AX*4), SI
|
LEAQ (SI)(AX*4), SI // SI = &(SI[incX*4])
|
||||||
LOOP absum_loop
|
LOOP absum_loop // } while --CX > 0
|
||||||
ADDPD X3, X0
|
|
||||||
ADDPD X5, X7
|
// p_sum_0 = \sum_{i=0}^{3}( p_sum_(i*2) )
|
||||||
ADDPD X7, X0
|
ADDPD X3, X0
|
||||||
|
ADDPD X5, X7
|
||||||
|
ADDPD X7, X0
|
||||||
|
|
||||||
|
// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
|
||||||
MOVAPS X0, X1
|
MOVAPS X0, X1
|
||||||
SHUFPD $0x3, X0, X0
|
SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
|
||||||
ADDSD X1, X0
|
ADDSD X1, X0
|
||||||
MOVSD X0, X1
|
|
||||||
CMPQ BX, $0
|
CMPQ BX, $0
|
||||||
JE absum_end
|
JE absum_end // if BX == 0 { goto absum_end }
|
||||||
|
|
||||||
absum_tail_start:
|
absum_tail_start: // Reset loop registers
|
||||||
MOVQ BX, CX
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
XORPS X8, X8
|
XORPS X8, X8 // X_8 = 0
|
||||||
|
|
||||||
absum_tail:
|
absum_tail: // do {
|
||||||
MOVSD (SI), X8
|
// p_sum = max( p_sum + x[i], p_sum - x[i] )
|
||||||
ADDSD X8, X0
|
MOVSD (SI), X8 // X_8 = x[i]
|
||||||
SUBSD X8, X1
|
MOVSD X0, X1 // p_sum_1 = p_sum_0
|
||||||
MAXSD X1, X0
|
ADDSD X8, X0 // p_sum_0 += X_8
|
||||||
MOVSD X0, X1
|
SUBSD X8, X1 // p_sum_1 -= X_8
|
||||||
ADDQ AX, SI
|
MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 )
|
||||||
LOOP absum_tail
|
ADDQ AX, SI // i++
|
||||||
|
LOOP absum_tail // } while --CX > 0
|
||||||
|
|
||||||
absum_end:
|
absum_end: // return p_sum_0
|
||||||
MOVSD X1, sum+40(FP)
|
MOVSD X0, sum+40(FP)
|
||||||
RET
|
RET
|
||||||
|
@@ -8,59 +8,59 @@
|
|||||||
|
|
||||||
// func Add(dst, s []float64)
|
// func Add(dst, s []float64)
|
||||||
TEXT ·Add(SB), NOSPLIT, $0
|
TEXT ·Add(SB), NOSPLIT, $0
|
||||||
MOVQ dst_base+0(FP), DI
|
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||||
MOVQ dst_len+8(FP), CX
|
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||||
MOVQ s_base+24(FP), SI
|
MOVQ s_base+24(FP), SI // SI := &s
|
||||||
CMPQ s_len+32(FP), CX
|
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
|
||||||
CMOVQLE s_len+32(FP), CX
|
CMOVQLE s_len+32(FP), CX
|
||||||
CMPQ CX, $0
|
CMPQ CX, $0 // if CX == 0 { return }
|
||||||
JE add_end
|
JE add_end
|
||||||
XORQ AX, AX
|
XORQ AX, AX
|
||||||
MOVQ DI, BX
|
MOVQ DI, BX
|
||||||
ANDQ $0x0F, BX
|
ANDQ $0x0F, BX // BX := &dst & 15
|
||||||
JZ add_no_trim
|
JZ add_no_trim // if BX == 0 { goto add_no_trim }
|
||||||
|
|
||||||
// Align on 16-byte boundary
|
// Align on 16-byte boundary
|
||||||
MOVSD (DI)(AX*8), X0
|
MOVSD (SI)(AX*8), X0 // X0 = s[i]
|
||||||
ADDSD (SI)(AX*8), X0
|
ADDSD (DI)(AX*8), X0 // X0 += dst[i]
|
||||||
MOVSD X0, (DI)(AX*8)
|
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||||
INCQ AX
|
INCQ AX // i++
|
||||||
DECQ CX
|
DECQ CX // --CX
|
||||||
JE add_end
|
JE add_end // if CX == 0 { return }
|
||||||
|
|
||||||
add_no_trim:
|
add_no_trim:
|
||||||
MOVQ CX, BX
|
MOVQ CX, BX
|
||||||
ANDQ $7, BX
|
ANDQ $7, BX // BX := CX % 8
|
||||||
SHRQ $3, CX
|
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||||
JZ add_tail_start
|
JZ add_tail_start // if CX == 0 { goto add_tail_start }
|
||||||
|
|
||||||
add_loop: // Loop unrolled 8x
|
add_loop: // Loop unrolled 8x do {
|
||||||
MOVUPS (SI)(AX*8), X0
|
MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1]
|
||||||
MOVUPS 16(SI)(AX*8), X1
|
MOVUPS 16(SI)(AX*8), X1
|
||||||
MOVUPS 32(SI)(AX*8), X2
|
MOVUPS 32(SI)(AX*8), X2
|
||||||
MOVUPS 48(SI)(AX*8), X3
|
MOVUPS 48(SI)(AX*8), X3
|
||||||
ADDPD (DI)(AX*8), X0
|
ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+1]
|
||||||
ADDPD 16(DI)(AX*8), X1
|
ADDPD 16(DI)(AX*8), X1
|
||||||
ADDPD 32(DI)(AX*8), X2
|
ADDPD 32(DI)(AX*8), X2
|
||||||
ADDPD 48(DI)(AX*8), X3
|
ADDPD 48(DI)(AX*8), X3
|
||||||
MOVUPS X0, (DI)(AX*8)
|
MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X_i
|
||||||
MOVUPS X1, 16(DI)(AX*8)
|
MOVUPS X1, 16(DI)(AX*8)
|
||||||
MOVUPS X2, 32(DI)(AX*8)
|
MOVUPS X2, 32(DI)(AX*8)
|
||||||
MOVUPS X3, 48(DI)(AX*8)
|
MOVUPS X3, 48(DI)(AX*8)
|
||||||
ADDQ $8, AX
|
ADDQ $8, AX // i += 8
|
||||||
LOOP add_loop
|
LOOP add_loop // } while --CX > 0
|
||||||
CMPQ BX, $0
|
CMPQ BX, $0 // if BX == 0 { return }
|
||||||
JE add_end
|
JE add_end
|
||||||
|
|
||||||
add_tail_start:
|
add_tail_start: // Reset loop registers
|
||||||
MOVQ BX, CX
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
|
|
||||||
add_tail:
|
add_tail: // do {
|
||||||
MOVSD (DI)(AX*8), X0
|
MOVSD (SI)(AX*8), X0 // X0 = s[i]
|
||||||
ADDSD (SI)(AX*8), X0
|
ADDSD (DI)(AX*8), X0 // X0 += dst[i]
|
||||||
MOVSD X0, (DI)(AX*8)
|
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||||
INCQ AX
|
INCQ AX // ++i
|
||||||
LOOP add_tail
|
LOOP add_tail // } while --CX > 0
|
||||||
|
|
||||||
add_end:
|
add_end:
|
||||||
RET
|
RET
|
||||||
|
@@ -8,46 +8,46 @@
|
|||||||
|
|
||||||
// func AddConst(alpha float64, x []float64)
|
// func AddConst(alpha float64, x []float64)
|
||||||
TEXT ·AddConst(SB), NOSPLIT, $0
|
TEXT ·AddConst(SB), NOSPLIT, $0
|
||||||
MOVQ x_base+8(FP), SI
|
MOVQ x_base+8(FP), SI // SI := &x
|
||||||
MOVQ x_len+16(FP), CX
|
MOVQ x_len+16(FP), CX // CX := len(x)
|
||||||
CMPQ CX, $0
|
CMPQ CX, $0 // if len(x) == 0 { return }
|
||||||
JE ac_end
|
JE ac_end
|
||||||
MOVSD alpha+0(FP), X4
|
MOVSD alpha+0(FP), X4 // X4 = { a, a }
|
||||||
SHUFPD $0, X4, X4
|
SHUFPD $0, X4, X4
|
||||||
MOVUPS X4, X5
|
MOVUPS X4, X5 // X5 = X4
|
||||||
XORQ AX, AX
|
XORQ AX, AX // i = 0
|
||||||
MOVQ CX, BX
|
MOVQ CX, BX
|
||||||
ANDQ $7, BX
|
ANDQ $7, BX // BX := len(x) % 8
|
||||||
SHRQ $3, CX
|
SHRQ $3, CX // CX := floor( CX / 8 )
|
||||||
JZ ac_tail_start
|
JZ ac_tail_start // if CX == 0 { goto ac_tail_start }
|
||||||
|
|
||||||
ac_loop:
|
ac_loop: // Loop unrolled 8x do {
|
||||||
MOVUPS (SI)(AX*8), X0
|
MOVUPS (SI)(AX*8), X0 // X_i = x[i:i+1]
|
||||||
MOVUPS 16(SI)(AX*8), X1
|
MOVUPS 16(SI)(AX*8), X1
|
||||||
MOVUPS 32(SI)(AX*8), X2
|
MOVUPS 32(SI)(AX*8), X2
|
||||||
MOVUPS 48(SI)(AX*8), X3
|
MOVUPS 48(SI)(AX*8), X3
|
||||||
ADDPD X4, X0
|
ADDPD X4, X0 // X_i += a
|
||||||
ADDPD X5, X1
|
ADDPD X5, X1
|
||||||
ADDPD X4, X2
|
ADDPD X4, X2
|
||||||
ADDPD X5, X3
|
ADDPD X5, X3
|
||||||
MOVUPS X0, (SI)(AX*8)
|
MOVUPS X0, (SI)(AX*8) // x[i:i+1] = X_i
|
||||||
MOVUPS X1, 16(SI)(AX*8)
|
MOVUPS X1, 16(SI)(AX*8)
|
||||||
MOVUPS X2, 32(SI)(AX*8)
|
MOVUPS X2, 32(SI)(AX*8)
|
||||||
MOVUPS X3, 48(SI)(AX*8)
|
MOVUPS X3, 48(SI)(AX*8)
|
||||||
ADDQ $8, AX
|
ADDQ $8, AX // i += 8
|
||||||
LOOP ac_loop
|
LOOP ac_loop // } while --CX > 0
|
||||||
CMPQ BX, $0
|
CMPQ BX, $0 // if BX == 0 { return }
|
||||||
JE ac_end
|
JE ac_end
|
||||||
|
|
||||||
ac_tail_start:
|
ac_tail_start: // Reset loop counters
|
||||||
MOVQ BX, CX
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
|
|
||||||
ac_tail:
|
ac_tail: // do {
|
||||||
MOVSD (SI)(AX*8), X0
|
MOVSD (SI)(AX*8), X0 // X0 = x[i]
|
||||||
ADDSD X4, X0
|
ADDSD X4, X0 // X0 += a
|
||||||
MOVSD X0, (SI)(AX*8)
|
MOVSD X0, (SI)(AX*8) // x[i] = X0
|
||||||
INCQ AX
|
INCQ AX // ++i
|
||||||
LOOP ac_tail
|
LOOP ac_tail // } while --CX > 0
|
||||||
|
|
||||||
ac_end:
|
ac_end:
|
||||||
RET
|
RET
|
||||||
|
@@ -7,63 +7,65 @@
|
|||||||
#include "textflag.h"
|
#include "textflag.h"
|
||||||
|
|
||||||
TEXT ·CumProd(SB), NOSPLIT, $0
|
TEXT ·CumProd(SB), NOSPLIT, $0
|
||||||
MOVQ dst_base+0(FP), DI
|
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||||
MOVQ dst_len+8(FP), CX
|
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||||
MOVQ s_base+24(FP), SI
|
MOVQ s_base+24(FP), SI // SI := &s
|
||||||
CMPQ s_len+32(FP), CX
|
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
|
||||||
CMOVQLE s_len+32(FP), CX
|
CMOVQLE s_len+32(FP), CX
|
||||||
MOVQ CX, ret_len+56(FP)
|
MOVQ CX, ret_len+56(FP) // len(ret) = CX
|
||||||
CMPQ CX, $0
|
CMPQ CX, $0 // if CX == 0 { return }
|
||||||
JE cs_end
|
JE cp_end
|
||||||
XORQ AX, AX
|
XORQ AX, AX // i := 0
|
||||||
|
|
||||||
MOVSD (SI), X5
|
MOVSD (SI), X5 // p_prod = { s[0], s[0] }
|
||||||
SHUFPD $0, X5, X5
|
SHUFPD $0, X5, X5
|
||||||
MOVSD X5, (DI)
|
MOVSD X5, (DI) // dst[0] = s[0]
|
||||||
INCQ AX
|
INCQ AX // ++i
|
||||||
DECQ CX
|
DECQ CX // -- CX
|
||||||
JZ cs_end
|
JZ cp_end // if CX == 0 { return }
|
||||||
|
|
||||||
MOVQ CX, BX
|
MOVQ CX, BX
|
||||||
ANDQ $3, BX
|
ANDQ $3, BX // BX := CX % 4
|
||||||
SHRQ $2, CX
|
SHRQ $2, CX // CX = floor( CX / 4 )
|
||||||
JZ cs_tail_start
|
JZ cp_tail_start // if CX == 0 { goto cp_tail_start }
|
||||||
|
|
||||||
cs_loop:
|
cp_loop: // Loop unrolled 4x do {
|
||||||
MOVUPS (SI)(AX*8), X0
|
MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
|
||||||
MOVUPS 16(SI)(AX*8), X2
|
MOVUPS 16(SI)(AX*8), X2
|
||||||
MOVAPS X0, X1
|
MOVAPS X0, X1 // X1 = X0
|
||||||
MOVAPS X2, X3
|
MOVAPS X2, X3
|
||||||
SHUFPD $1, X1, X1
|
SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
|
||||||
SHUFPD $1, X3, X3
|
SHUFPD $1, X3, X3
|
||||||
MULPD X0, X1
|
MULPD X0, X1 // X1 *= X0
|
||||||
MULPD X2, X3
|
MULPD X2, X3
|
||||||
SHUFPD $2, X1, X0
|
SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
|
||||||
SHUFPD $3, X1, X1
|
SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
|
||||||
SHUFPD $2, X3, X2
|
SHUFPD $2, X3, X2
|
||||||
SHUFPD $3, X3, X3
|
SHUFPD $3, X3, X3
|
||||||
MULPD X5, X0
|
MULPD X5, X0 // X0 *= p_prod
|
||||||
MULPD X1, X5
|
MULPD X1, X5 // p_prod *= X1
|
||||||
MULPD X5, X2
|
MULPD X5, X2
|
||||||
MOVUPS X0, (DI)(AX*8)
|
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
|
||||||
MOVUPS X2, 16(DI)(AX*8)
|
MOVUPS X2, 16(DI)(AX*8)
|
||||||
MULPD X3, X5
|
MULPD X3, X5
|
||||||
ADDQ $4, AX
|
ADDQ $4, AX // i += 4
|
||||||
LOOP cs_loop
|
LOOP cp_loop // } while --CX > 0
|
||||||
CMPQ BX, $0
|
|
||||||
JE cs_end
|
|
||||||
|
|
||||||
cs_tail_start:
|
// if BX == 0 { return }
|
||||||
MOVQ BX, CX
|
CMPQ BX, $0
|
||||||
|
JE cp_end
|
||||||
|
|
||||||
cs_tail:
|
cp_tail_start: // Reset loop registers
|
||||||
MULSD (SI)(AX*8), X5
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
MOVSD X5, (DI)(AX*8)
|
|
||||||
INCQ AX
|
|
||||||
LOOP cs_tail
|
|
||||||
|
|
||||||
cs_end:
|
cp_tail: // do {
|
||||||
MOVQ DI, ret_base+48(FP)
|
MULSD (SI)(AX*8), X5 // p_prod *= s[i]
|
||||||
MOVQ dst_cap+16(FP), SI
|
MOVSD X5, (DI)(AX*8) // dst[i] = p_prod
|
||||||
|
INCQ AX // ++i
|
||||||
|
LOOP cp_tail // } while --CX > 0
|
||||||
|
|
||||||
|
cp_end:
|
||||||
|
MOVQ DI, ret_base+48(FP) // &ret = &dst
|
||||||
|
MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst)
|
||||||
MOVQ SI, ret_cap+64(FP)
|
MOVQ SI, ret_cap+64(FP)
|
||||||
RET
|
RET
|
||||||
|
@@ -7,56 +7,58 @@
|
|||||||
#include "textflag.h"
|
#include "textflag.h"
|
||||||
|
|
||||||
TEXT ·CumSum(SB), NOSPLIT, $0
|
TEXT ·CumSum(SB), NOSPLIT, $0
|
||||||
MOVQ dst_base+0(FP), DI
|
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||||
MOVQ dst_len+8(FP), CX
|
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||||
MOVQ s_base+24(FP), SI
|
MOVQ s_base+24(FP), SI // SI := &s
|
||||||
CMPQ s_len+32(FP), CX
|
CMPQ s_len+32(FP), CX // CX := min( CX, len(s) )
|
||||||
CMOVQLE s_len+32(FP), CX
|
CMOVQLE s_len+32(FP), CX
|
||||||
MOVQ CX, ret_len+56(FP)
|
MOVQ CX, ret_len+56(FP) // len(ret) = CX
|
||||||
CMPQ CX, $0
|
CMPQ CX, $0 // if CX == 0 { return }
|
||||||
JE cs_end
|
JE cs_end
|
||||||
XORQ AX, AX
|
XORQ AX, AX // i := 0
|
||||||
PXOR X5, X5
|
PXOR X5, X5 // p_sum = 0
|
||||||
MOVQ CX, BX
|
MOVQ CX, BX
|
||||||
ANDQ $3, BX
|
ANDQ $3, BX // BX := CX % 4
|
||||||
SHRQ $2, CX
|
SHRQ $2, CX // CX = floor( CX / 4 )
|
||||||
JZ cs_tail_start
|
JZ cs_tail_start // if CX == 0 { goto cs_tail_start }
|
||||||
|
|
||||||
cs_loop:
|
cs_loop: // Loop unrolled 4x do {
|
||||||
MOVUPS (SI)(AX*8), X0
|
MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
|
||||||
MOVUPS 16(SI)(AX*8), X2
|
MOVUPS 16(SI)(AX*8), X2
|
||||||
MOVAPS X0, X1
|
MOVAPS X0, X1 // X1 = X0
|
||||||
MOVAPS X2, X3
|
MOVAPS X2, X3
|
||||||
SHUFPD $1, X1, X1
|
SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
|
||||||
SHUFPD $1, X3, X3
|
SHUFPD $1, X3, X3
|
||||||
ADDPD X0, X1
|
ADDPD X0, X1 // X1 += X0
|
||||||
ADDPD X2, X3
|
ADDPD X2, X3
|
||||||
SHUFPD $2, X1, X0
|
SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
|
||||||
SHUFPD $3, X1, X1
|
SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
|
||||||
SHUFPD $2, X3, X2
|
SHUFPD $2, X3, X2
|
||||||
SHUFPD $3, X3, X3
|
SHUFPD $3, X3, X3
|
||||||
ADDPD X5, X0
|
ADDPD X5, X0 // X0 += p_sum
|
||||||
ADDPD X1, X5
|
ADDPD X1, X5 // p_sum += X1
|
||||||
ADDPD X5, X2
|
ADDPD X5, X2
|
||||||
MOVUPS X0, (DI)(AX*8)
|
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
|
||||||
MOVUPS X2, 16(DI)(AX*8)
|
MOVUPS X2, 16(DI)(AX*8)
|
||||||
ADDPD X3, X5
|
ADDPD X3, X5
|
||||||
ADDQ $4, AX
|
ADDQ $4, AX // i += 4
|
||||||
LOOP cs_loop
|
LOOP cs_loop // } while --CX > 0
|
||||||
CMPQ BX, $0
|
|
||||||
JE cs_end
|
|
||||||
|
|
||||||
cs_tail_start:
|
// if BX == 0 { return }
|
||||||
MOVQ BX, CX
|
CMPQ BX, $0
|
||||||
|
JE cs_end
|
||||||
|
|
||||||
cs_tail:
|
cs_tail_start: // Reset loop registers
|
||||||
ADDSD (SI)(AX*8), X5
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
MOVSD X5, (DI)(AX*8)
|
|
||||||
INCQ AX
|
cs_tail: // do {
|
||||||
LOOP cs_tail
|
ADDSD (SI)(AX*8), X5 // p_sum += s[i]
|
||||||
|
MOVSD X5, (DI)(AX*8) // dst[i] = p_sum
|
||||||
|
INCQ AX // ++i
|
||||||
|
LOOP cs_tail // } while --CX > 0
|
||||||
|
|
||||||
cs_end:
|
cs_end:
|
||||||
MOVQ DI, ret_base+48(FP)
|
MOVQ DI, ret_base+48(FP) // &ret = &dst
|
||||||
MOVQ dst_cap+16(FP), SI
|
MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst)
|
||||||
MOVQ SI, ret_cap+64(FP)
|
MOVQ SI, ret_cap+64(FP)
|
||||||
RET
|
RET
|
||||||
|
@@ -8,59 +8,59 @@
|
|||||||
|
|
||||||
// func Div(dst, s []float64)
|
// func Div(dst, s []float64)
|
||||||
TEXT ·Div(SB), NOSPLIT, $0
|
TEXT ·Div(SB), NOSPLIT, $0
|
||||||
MOVQ dst_base+0(FP), DI
|
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||||
MOVQ dst_len+8(FP), CX
|
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||||
MOVQ s_base+24(FP), SI
|
MOVQ s_base+24(FP), SI // SI := &s
|
||||||
CMPQ s_len+32(FP), CX
|
CMPQ s_len+32(FP), CX // CX = min( CX, len(s) )
|
||||||
CMOVQLE s_len+32(FP), CX
|
CMOVQLE s_len+32(FP), CX
|
||||||
CMPQ CX, $0
|
CMPQ CX, $0 // if CX == 0 { return }
|
||||||
JE div_end
|
JE div_end
|
||||||
XORQ AX, AX
|
XORQ AX, AX // i := 0
|
||||||
MOVQ SI, BX
|
MOVQ SI, BX
|
||||||
ANDQ $15, BX
|
ANDQ $15, BX // BX := &s & 15
|
||||||
JZ div_no_trim
|
JZ div_no_trim // if BX == 0 { goto div_no_trim }
|
||||||
|
|
||||||
// Align on 16-byte boundary
|
// Align on 16-byte boundary
|
||||||
MOVSD (DI)(AX*8), X0
|
MOVSD (DI)(AX*8), X0 // X0 := dst[i]
|
||||||
DIVSD (SI)(AX*8), X0
|
DIVSD (SI)(AX*8), X0 // X0 /= s[i]
|
||||||
MOVSD X0, (DI)(AX*8)
|
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||||
INCQ AX
|
INCQ AX // ++i
|
||||||
DECQ CX
|
DECQ CX // --CX
|
||||||
JZ div_end
|
JZ div_end // if CX == 0 { return }
|
||||||
|
|
||||||
div_no_trim:
|
div_no_trim:
|
||||||
MOVQ CX, BX
|
MOVQ CX, BX
|
||||||
ANDQ $7, BX
|
ANDQ $7, BX // BX = CX % 8
|
||||||
SHRQ $3, CX
|
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||||
JZ div_tail_start
|
JZ div_tail_start // if CX == 0 { goto div_tail_start }
|
||||||
|
|
||||||
div_loop: // Loop unrolled 8x
|
div_loop: // Loop unrolled 8x do {
|
||||||
MOVUPS (SI)(AX*8), X0
|
MOVUPS (DI)(AX*8), X0 // X0 := dst[i:i+1]
|
||||||
MOVUPS 16(SI)(AX*8), X1
|
MOVUPS 16(DI)(AX*8), X1
|
||||||
MOVUPS 32(SI)(AX*8), X2
|
MOVUPS 32(DI)(AX*8), X2
|
||||||
MOVUPS 48(SI)(AX*8), X3
|
MOVUPS 48(DI)(AX*8), X3
|
||||||
DIVPD (DI)(AX*8), X0
|
DIVPD (SI)(AX*8), X0 // X0 /= s[i:i+1]
|
||||||
DIVPD 16(DI)(AX*8), X1
|
DIVPD 16(SI)(AX*8), X1
|
||||||
DIVPD 32(DI)(AX*8), X2
|
DIVPD 32(SI)(AX*8), X2
|
||||||
DIVPD 48(DI)(AX*8), X3
|
DIVPD 48(SI)(AX*8), X3
|
||||||
MOVUPS X0, (DI)(AX*8)
|
MOVUPS X0, (DI)(AX*8) // dst[i] = X0
|
||||||
MOVUPS X1, 16(DI)(AX*8)
|
MOVUPS X1, 16(DI)(AX*8)
|
||||||
MOVUPS X2, 32(DI)(AX*8)
|
MOVUPS X2, 32(DI)(AX*8)
|
||||||
MOVUPS X3, 48(DI)(AX*8)
|
MOVUPS X3, 48(DI)(AX*8)
|
||||||
ADDQ $4, AX
|
ADDQ $8, AX // i += 8
|
||||||
LOOP div_loop
|
LOOP div_loop // } while --CX > 0
|
||||||
CMPQ BX, $0
|
CMPQ BX, $0 // if BX == 0 { return }
|
||||||
JE div_end
|
JE div_end
|
||||||
|
|
||||||
div_tail_start:
|
div_tail_start: // Reset loop registers
|
||||||
MOVQ BX, CX
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
|
|
||||||
div_tail:
|
div_tail: // do {
|
||||||
MOVSD (DI)(AX*8), X0
|
MOVSD (DI)(AX*8), X0 // X0 = dst[i]
|
||||||
DIVSD (SI)(AX*8), X0
|
DIVSD (SI)(AX*8), X0 // X0 /= s[i]
|
||||||
MOVSD X0, (DI)(AX*8)
|
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||||
INCQ AX
|
INCQ AX // ++i
|
||||||
LOOP div_tail
|
LOOP div_tail // } while --CX > 0
|
||||||
|
|
||||||
div_end:
|
div_end:
|
||||||
RET
|
RET
|
||||||
|
@@ -8,66 +8,66 @@
|
|||||||
|
|
||||||
// func DivTo(dst, x, y []float64)
|
// func DivTo(dst, x, y []float64)
|
||||||
TEXT ·DivTo(SB), NOSPLIT, $0
|
TEXT ·DivTo(SB), NOSPLIT, $0
|
||||||
MOVQ dst_base+0(FP), DI
|
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||||
MOVQ dst_len+8(FP), CX
|
MOVQ dst_len+8(FP), CX // CX := len(dst)
|
||||||
MOVQ x_base+24(FP), SI
|
MOVQ x_base+24(FP), SI // SI := &x
|
||||||
MOVQ y_base+48(FP), DX
|
MOVQ y_base+48(FP), DX // DX := &y
|
||||||
CMPQ x_len+32(FP), CX
|
CMPQ x_len+32(FP), CX // CX = min( len(dst), len(x), len(y) )
|
||||||
CMOVQLE x_len+32(FP), CX
|
CMOVQLE x_len+32(FP), CX
|
||||||
CMPQ y_len+56(FP), CX
|
CMPQ y_len+56(FP), CX
|
||||||
CMOVQLE y_len+56(FP), CX
|
CMOVQLE y_len+56(FP), CX
|
||||||
MOVQ CX, ret_len+80(FP)
|
MOVQ CX, ret_len+80(FP) // len(ret) = CX
|
||||||
CMPQ CX, $0
|
CMPQ CX, $0 // if CX == 0 { return }
|
||||||
JE div_end
|
JE div_end
|
||||||
XORQ AX, AX
|
XORQ AX, AX // i := 0
|
||||||
MOVQ DI, BX
|
MOVQ DI, BX
|
||||||
ANDQ $15, BX
|
ANDQ $15, BX // BX := &dst & 0xF
|
||||||
JZ div_no_trim
|
JZ div_no_trim // if BX == 0 { goto div_no_trim }
|
||||||
|
|
||||||
// Align on 16-byte boundary
|
// Align on 16-byte boundary
|
||||||
MOVSD (SI)(AX*8), X0
|
MOVSD (SI)(AX*8), X0 // X0 := x[i]
|
||||||
DIVSD (DX)(AX*8), X0
|
DIVSD (DX)(AX*8), X0 // X0 /= y[i]
|
||||||
MOVSD X0, (DI)(AX*8)
|
MOVSD X0, (DI)(AX*8) // dst[i] = X0
|
||||||
INCQ AX
|
INCQ AX // ++i
|
||||||
DECQ CX
|
DECQ CX // --CX
|
||||||
JZ div_end // */
|
JZ div_end // if CX == 0 { return }
|
||||||
|
|
||||||
div_no_trim:
|
div_no_trim:
|
||||||
MOVQ CX, BX
|
MOVQ CX, BX
|
||||||
ANDQ $7, BX
|
ANDQ $7, BX // BX = CX % 8
|
||||||
SHRQ $3, CX
|
SHRQ $3, CX // CX = floor( CX / 8 )
|
||||||
JZ div_tail_start
|
JZ div_tail_start // if CX == 0 { goto div_tail_start }
|
||||||
|
|
||||||
div_loop: // Unroll 8x
|
div_loop: // Loop unrolled 8x do {
|
||||||
MOVUPS (SI)(AX*8), X0
|
MOVUPS (SI)(AX*8), X0 // X0 := x[i:i+1]
|
||||||
MOVUPS 16(SI)(AX*8), X1
|
MOVUPS 16(SI)(AX*8), X1
|
||||||
MOVUPS 32(SI)(AX*8), X2
|
MOVUPS 32(SI)(AX*8), X2
|
||||||
MOVUPS 48(SI)(AX*8), X3
|
MOVUPS 48(SI)(AX*8), X3
|
||||||
DIVPD (DX)(AX*8), X0
|
DIVPD (DX)(AX*8), X0 // X0 /= y[i:i+1]
|
||||||
DIVPD 16(DX)(AX*8), X1
|
DIVPD 16(DX)(AX*8), X1
|
||||||
DIVPD 32(DX)(AX*8), X2
|
DIVPD 32(DX)(AX*8), X2
|
||||||
DIVPD 48(DX)(AX*8), X3
|
DIVPD 48(DX)(AX*8), X3
|
||||||
MOVUPS X0, (DI)(AX*8)
|
MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X0
|
||||||
MOVUPS X1, 16(DI)(AX*8)
|
MOVUPS X1, 16(DI)(AX*8)
|
||||||
MOVUPS X2, 32(DI)(AX*8)
|
MOVUPS X2, 32(DI)(AX*8)
|
||||||
MOVUPS X3, 48(DI)(AX*8)
|
MOVUPS X3, 48(DI)(AX*8)
|
||||||
ADDQ $8, AX
|
ADDQ $8, AX // i += 8
|
||||||
LOOP div_loop
|
LOOP div_loop // } while --CX > 0
|
||||||
CMPQ CX, $0
|
CMPQ BX, $0 // if BX == 0 { return }
|
||||||
JE div_end
|
JE div_end
|
||||||
|
|
||||||
div_tail_start:
|
div_tail_start: // Reset loop registers
|
||||||
MOVQ BX, CX
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
|
|
||||||
div_tail:
|
div_tail: // do {
|
||||||
MOVSD (SI)(AX*8), X0
|
MOVSD (SI)(AX*8), X0 // X0 = x[i]
|
||||||
DIVSD (DX)(AX*8), X0
|
DIVSD (DX)(AX*8), X0 // X0 /= y[i]
|
||||||
MOVSD X0, (DI)(AX*8)
|
MOVSD X0, (DI)(AX*8)
|
||||||
INCQ AX
|
INCQ AX // ++i
|
||||||
LOOP div_tail
|
LOOP div_tail // } while --CX > 0
|
||||||
|
|
||||||
div_end:
|
div_end:
|
||||||
MOVQ DI, ret_base+72(FP)
|
MOVQ DI, ret_base+72(FP) // &ret = &dst
|
||||||
MOVQ dst_cap+16(FP), DI
|
MOVQ dst_cap+16(FP), DI // cap(ret) = cap(dst)
|
||||||
MOVQ DI, ret_cap+88(FP)
|
MOVQ DI, ret_cap+88(FP)
|
||||||
RET
|
RET
|
||||||
|
@@ -8,46 +8,51 @@
|
|||||||
|
|
||||||
// func L1Norm(s, t []float64) float64
|
// func L1Norm(s, t []float64) float64
|
||||||
TEXT ·L1Norm(SB), NOSPLIT, $0
|
TEXT ·L1Norm(SB), NOSPLIT, $0
|
||||||
MOVQ s_base+0(FP), DI
|
MOVQ s_base+0(FP), DI // DI := &s
|
||||||
MOVQ t_base+24(FP), SI
|
MOVQ t_base+24(FP), SI // SI := &t
|
||||||
MOVQ s_len+8(FP), DX
|
MOVQ s_len+8(FP), CX // CX := len(s)
|
||||||
CMPQ t_len+32(FP), DX
|
CMPQ t_len+32(FP), CX // CX = min( CX, len(t) )
|
||||||
CMOVQLE t_len+32(FP), DX
|
CMOVQLE t_len+32(FP), CX
|
||||||
PXOR X3, X3
|
PXOR X3, X3 // norm := 0
|
||||||
XORQ AX, AX
|
CMPQ CX, $0 // if CX == 0 { return 0 }
|
||||||
CMPQ DX, $1
|
JE l1_end
|
||||||
JL l1_end
|
XORQ AX, AX // i := 0
|
||||||
SUBQ $1, DX
|
MOVQ CX, BX
|
||||||
JE l1_tail
|
ANDQ $1, BX // BX := CX % 2
|
||||||
|
SHRQ $1, CX // CX := floor( CX / 2 )
|
||||||
|
JZ l1_tail_start // if CX == 0 { goto l1_tail_start }
|
||||||
|
|
||||||
l1_loop:
|
l1_loop: // Loop unrolled 2x do {
|
||||||
MOVUPS (SI)(AX*8), X0
|
MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
|
||||||
MOVUPS (DI)(AX*8), X1
|
MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
|
||||||
MOVAPS X0, X2
|
MOVAPS X0, X2
|
||||||
SUBPD X1, X0
|
SUBPD X1, X0
|
||||||
SUBPD X2, X1
|
SUBPD X2, X1
|
||||||
MAXPD X1, X0
|
MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
|
||||||
ADDPD X0, X3
|
ADDPD X0, X3 // norm += X0
|
||||||
ADDQ $2, AX
|
ADDQ $2, AX // i += 2
|
||||||
CMPQ AX, DX
|
LOOP l1_loop // } while --CX > 0
|
||||||
JL l1_loop
|
CMPQ BX, $0 // if BX == 0 { return }
|
||||||
JG l1_end
|
JE l1_end
|
||||||
|
|
||||||
|
l1_tail_start: // Reset loop registers
|
||||||
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
|
PXOR X0, X0 // reset X0, X1 to break dependencies
|
||||||
|
PXOR X1, X1
|
||||||
|
|
||||||
l1_tail:
|
l1_tail:
|
||||||
PXOR X0, X0
|
MOVSD (SI)(AX*8), X0 // X0 = t[i]
|
||||||
PXOR X1, X1
|
MOVSD (DI)(AX*8), X1 // X1 = s[i]
|
||||||
MOVSD (SI)(AX*8), X0
|
|
||||||
MOVSD (DI)(AX*8), X1
|
|
||||||
MOVAPD X0, X2
|
MOVAPD X0, X2
|
||||||
SUBSD X1, X0
|
SUBSD X1, X0
|
||||||
SUBSD X2, X1
|
SUBSD X2, X1
|
||||||
MAXSD X1, X0
|
MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
|
||||||
ADDSD X0, X3
|
ADDSD X0, X3 // norm += X0
|
||||||
|
|
||||||
l1_end:
|
l1_end:
|
||||||
MOVAPS X3, X2
|
MOVAPS X3, X2
|
||||||
SHUFPD $1, X2, X2
|
SHUFPD $1, X2, X2
|
||||||
ADDSD X3, X2
|
ADDSD X3, X2 // X2 = X3[1] + X3[0]
|
||||||
MOVSD X2, ret+48(FP)
|
MOVSD X2, ret+48(FP) // return X2
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
@@ -8,45 +8,50 @@
|
|||||||
|
|
||||||
// func LinfNorm(s, t []float64) float64
|
// func LinfNorm(s, t []float64) float64
|
||||||
TEXT ·LinfNorm(SB), NOSPLIT, $0
|
TEXT ·LinfNorm(SB), NOSPLIT, $0
|
||||||
MOVQ s_base+0(FP), DI
|
MOVQ s_base+0(FP), DI // DI := &s
|
||||||
MOVQ t_base+24(FP), SI
|
MOVQ t_base+24(FP), SI // SI := &t
|
||||||
MOVQ s_len+8(FP), DX
|
MOVQ s_len+8(FP), CX // CX := len(s)
|
||||||
CMPQ t_len+32(FP), DX
|
CMPQ t_len+32(FP), CX // CX = min( CX, len(t) )
|
||||||
CMOVQLE t_len+32(FP), DX
|
CMOVQLE t_len+32(FP), CX
|
||||||
PXOR X3, X3
|
PXOR X3, X3 // norm := 0
|
||||||
XORQ AX, AX
|
CMPQ CX, $0 // if CX == 0 { return 0 }
|
||||||
CMPQ DX, $1
|
JE l1_end
|
||||||
JL l1_end
|
XORQ AX, AX // i := 0
|
||||||
SUBQ $1, DX
|
MOVQ CX, BX
|
||||||
JE l1_tail
|
ANDQ $1, BX // BX := CX % 2
|
||||||
|
SHRQ $1, CX // CX := floor( CX / 2 )
|
||||||
|
JZ l1_tail_start // if CX == 0 { goto l1_tail_start }
|
||||||
|
|
||||||
l1_loop:
|
l1_loop: // Loop unrolled 2x do {
|
||||||
MOVUPS (SI)(AX*8), X0
|
MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
|
||||||
MOVUPS (DI)(AX*8), X1
|
MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
|
||||||
MOVAPS X0, X2
|
MOVAPS X0, X2
|
||||||
SUBPD X1, X0
|
SUBPD X1, X0
|
||||||
SUBPD X2, X1
|
SUBPD X2, X1
|
||||||
MAXPD X1, X0
|
MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
|
||||||
MAXPD X0, X3
|
MAXPD X0, X3 // norm = max( norm, X0 )
|
||||||
ADDQ $2, AX
|
ADDQ $2, AX // i += 2
|
||||||
CMPQ AX, DX
|
LOOP l1_loop // } while --CX > 0
|
||||||
JL l1_loop
|
CMPQ BX, $0 // if BX == 0 { return }
|
||||||
JG l1_end
|
JE l1_end
|
||||||
|
|
||||||
|
l1_tail_start: // Reset loop registers
|
||||||
|
MOVQ BX, CX // Loop counter: CX = BX
|
||||||
|
PXOR X0, X0 // reset X0, X1 to break dependencies
|
||||||
|
PXOR X1, X1
|
||||||
|
|
||||||
l1_tail:
|
l1_tail:
|
||||||
PXOR X0, X0
|
MOVSD (SI)(AX*8), X0 // X0 = t[i]
|
||||||
PXOR X1, X1
|
MOVSD (DI)(AX*8), X1 // X1 = s[i]
|
||||||
MOVSD (SI)(AX*8), X0
|
|
||||||
MOVSD (DI)(AX*8), X1
|
|
||||||
MOVAPD X0, X2
|
MOVAPD X0, X2
|
||||||
SUBSD X1, X0
|
SUBSD X1, X0
|
||||||
SUBSD X2, X1
|
SUBSD X2, X1
|
||||||
MAXSD X1, X0
|
MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 )
|
||||||
MAXSD X0, X3
|
MAXSD X0, X3 // norm = max( norm, X0 )
|
||||||
|
|
||||||
l1_end:
|
l1_end:
|
||||||
MOVAPS X3, X2
|
MOVAPS X3, X2
|
||||||
SHUFPD $1, X2, X2
|
SHUFPD $1, X2, X2
|
||||||
MAXSD X3, X2
|
MAXSD X3, X2 // X2 = max( X3[1], X3[0] )
|
||||||
MOVSD X2, ret+48(FP)
|
MOVSD X2, ret+48(FP) // return X2
|
||||||
RET
|
RET
|
||||||
|
@@ -447,6 +447,11 @@ func TestDiv(t *testing.T) {
|
|||||||
src: []float64{1, 2, 3, 4},
|
src: []float64{1, 2, 3, 4},
|
||||||
expect: []float64{1, 1, 1, 1},
|
expect: []float64{1, 1, 1, 1},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
dst: []float64{1, 2, 3, 4, 2, 4, 6, 8},
|
||||||
|
src: []float64{1, 2, 3, 4, 1, 2, 3, 4},
|
||||||
|
expect: []float64{1, 1, 1, 1, 2, 2, 2, 2},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
dst: []float64{2, 4, 6},
|
dst: []float64{2, 4, 6},
|
||||||
src: []float64{1, 2, 3},
|
src: []float64{1, 2, 3},
|
||||||
|
Reference in New Issue
Block a user