mirror of
https://github.com/gonum/gonum.git
synced 2025-10-20 21:59:25 +08:00
asm/c128: Commenting asm code
This commit is contained in:
@@ -26,85 +26,106 @@
|
||||
|
||||
// func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
|
||||
TEXT ·AxpyInc(SB), NOSPLIT, $0
|
||||
MOVQ x_base+16(FP), SI
|
||||
MOVQ y_base+40(FP), DI
|
||||
MOVQ n+64(FP), CX
|
||||
CMPQ CX, $0 // if n==0, return
|
||||
MOVQ x_base+16(FP), SI // SI := &x
|
||||
MOVQ y_base+40(FP), DI // DI := y
|
||||
MOVQ n+64(FP), CX // CX := n
|
||||
CMPQ CX, $0 // if n==0 { return }
|
||||
JE axpyi_end
|
||||
MOVQ ix+88(FP), R8 // Load the first indicies
|
||||
MOVQ ix+88(FP), R8 // Load the first index
|
||||
SHLQ $1, R8 // Double to adjust for 16-byte size
|
||||
MOVQ iy+96(FP), R9
|
||||
SHLQ $1, R9
|
||||
LEAQ (SI)(R8*8), SI // Calculate addrress of first indicies
|
||||
LEAQ (DI)(R9*8), DI
|
||||
LEAQ (SI)(R8*8), SI // SI = &(x[ix])
|
||||
LEAQ (DI)(R9*8), DI // DI = &(y[iy])
|
||||
MOVQ incX+72(FP), R8 // Incrementors*16 for easy iteration (ADDQ)
|
||||
SHLQ $4, R8
|
||||
MOVQ incY+80(FP), R9
|
||||
SHLQ $4, R9
|
||||
MOVUPS alpha+0(FP), X0 // (ar,ai)
|
||||
MOVUPS alpha+0(FP), X0 // X0 := { imag(a), real(a) }
|
||||
MOVAPS X0, X1
|
||||
SHUFPD $0x1, X1, X1 // (ai,ar)
|
||||
MOVAPS X0, X10
|
||||
SHUFPD $0x1, X1, X1 // X1 := { real(a), imag(a) }
|
||||
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
|
||||
MOVAPS X1, X11
|
||||
MOVQ CX, BX
|
||||
ANDQ $3, CX
|
||||
SHRQ $2, BX
|
||||
JZ axpyi_tail
|
||||
ANDQ $3, CX // CX = floor( CX / 4 )
|
||||
SHRQ $2, BX // BX = CX % 4
|
||||
JZ axpyi_tail // if BX == 0 { goto caxy_tail }
|
||||
|
||||
axpyi_loop:
|
||||
MOVUPS (SI), X2
|
||||
axpyi_loop: // do {
|
||||
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
|
||||
MOVUPS (SI)(R8*1), X4
|
||||
LEAQ (SI)(R8*2), SI
|
||||
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
|
||||
MOVUPS (SI), X6
|
||||
MOVUPS (SI)(R8*1), X8
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
|
||||
// X_(i+1) = { real(x[i], real(x[i]) }
|
||||
MOVDDUP_X2_X3
|
||||
MOVDDUP_X4_X5
|
||||
SHUFPD $0x3, X4, X4
|
||||
MOVDDUP_X6_X7
|
||||
SHUFPD $0x3, X6, X6
|
||||
MOVDDUP_X8_X9
|
||||
|
||||
// X_i = { imag(x[i]), imag(x[i]) }
|
||||
SHUFPD $0x3, X2, X2
|
||||
SHUFPD $0x3, X4, X4
|
||||
SHUFPD $0x3, X6, X6
|
||||
SHUFPD $0x3, X8, X8
|
||||
MULPD X1, X2 // (ai*xr, ar*xr)
|
||||
MULPD X0, X3 // (ar*xi, ai*xi)
|
||||
MULPD X11, X4
|
||||
MULPD X10, X5
|
||||
MULPD X1, X6
|
||||
MULPD X0, X7
|
||||
MULPD X11, X8
|
||||
MULPD X10, X9
|
||||
ADDSUBPD_X2_X3 // Add/Sub to (ai*xr + ar*xi , ar*xr - (ai*xi))
|
||||
|
||||
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
|
||||
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
|
||||
MULPD X1, X2
|
||||
MULPD X0, X3
|
||||
MULPD X11, X4
|
||||
MULPD X10, X5
|
||||
MULPD X1, X6
|
||||
MULPD X0, X7
|
||||
MULPD X11, X8
|
||||
MULPD X10, X9
|
||||
|
||||
// X_(i+1) = {
|
||||
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
|
||||
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
|
||||
// }
|
||||
ADDSUBPD_X2_X3
|
||||
ADDSUBPD_X4_X5
|
||||
ADDSUBPD_X6_X7
|
||||
ADDSUBPD_X8_X9
|
||||
|
||||
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
|
||||
ADDPD (DI), X3
|
||||
ADDPD (DI)(R9*1), X5
|
||||
MOVUPS X3, (DI) // Write result back to dst
|
||||
MOVUPS X3, (DI) // y[i] = X_(i+1)
|
||||
MOVUPS X5, (DI)(R9*1)
|
||||
LEAQ (DI)(R9*2), DI
|
||||
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
|
||||
ADDPD (DI), X7
|
||||
ADDPD (DI)(R9*1), X9
|
||||
MOVUPS X7, (DI) // Write result back to dst
|
||||
MOVUPS X7, (DI)
|
||||
MOVUPS X9, (DI)(R9*1)
|
||||
LEAQ (SI)(R8*2), SI // Increment addresses
|
||||
LEAQ (DI)(R9*2), DI
|
||||
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
|
||||
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
|
||||
DECQ BX
|
||||
JNZ axpyi_loop
|
||||
CMPQ CX, $0
|
||||
JNZ axpyi_loop // } while --BX > 0
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE axpyi_end
|
||||
|
||||
axpyi_tail:
|
||||
MOVUPS (SI), X2
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
MULPD X1, X2 // (ai*x2r, ar*x2r, ai*x1r, ar*x1r)
|
||||
MULPD X0, X3 // (ar*x2i, ai*x2i, ar*x1i, ai*x1i)
|
||||
ADDSUBPD_X2_X3 // (ai*x2r+ar*x2i, ar*x2r-ai*x2i, ai*x1r+ar*x1i, ar*x1r-ai*x1i)
|
||||
axpyi_tail: // do {
|
||||
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
|
||||
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) }
|
||||
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
|
||||
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
|
||||
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
|
||||
|
||||
// X_(i+1) = {
|
||||
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
|
||||
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
|
||||
// }
|
||||
ADDSUBPD_X2_X3
|
||||
|
||||
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
|
||||
ADDPD (DI), X3
|
||||
MOVUPS X3, (DI)
|
||||
ADDQ R8, SI // Increment addresses
|
||||
ADDQ R9, DI
|
||||
LOOP axpyi_tail
|
||||
MOVUPS X3, (DI) // y[i] = X_i
|
||||
ADDQ R8, SI // SI = &(SI[incX])
|
||||
ADDQ R9, DI // DI = &(DI[incY])
|
||||
LOOP axpyi_tail // } while --CX > 0
|
||||
|
||||
axpyi_end:
|
||||
RET
|
||||
|
@@ -26,94 +26,116 @@
|
||||
|
||||
// func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
|
||||
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ x_base+56(FP), SI
|
||||
MOVQ y_base+80(FP), DX
|
||||
MOVQ n+104(FP), CX
|
||||
CMPQ CX, $0 // if n==0, return
|
||||
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||
MOVQ x_base+56(FP), SI // SI := &x
|
||||
MOVQ y_base+80(FP), DX // DX := &y
|
||||
MOVQ n+104(FP), CX // CX = n
|
||||
CMPQ CX, $0 // if n==0 { return }
|
||||
JE axpyi_end
|
||||
MOVQ ix+128(FP), R8 // Load the first indicies
|
||||
MOVQ ix+128(FP), R8 // Load the first index
|
||||
SHLQ $1, R8 // Double to adjust for 16-byte size
|
||||
MOVQ iy+136(FP), R9
|
||||
SHLQ $1, R9
|
||||
MOVQ idst+32(FP), R10
|
||||
SHLQ $1, R10
|
||||
LEAQ (SI)(R8*8), SI // Calculate addrress of first indicies
|
||||
LEAQ (DX)(R9*8), DX
|
||||
LEAQ (DI)(R10*8), DI
|
||||
MOVQ incX+112(FP), R8 // Incrementors*8 for easy iteration (ADDQ)
|
||||
SHLQ $4, R8
|
||||
LEAQ (SI)(R8*8), SI // SI = &(x[ix])
|
||||
LEAQ (DX)(R9*8), DX // DX = &(y[iy])
|
||||
LEAQ (DI)(R10*8), DI // DI = &(dst[idst])
|
||||
MOVQ incX+112(FP), R8 // R8 = incX * sizeof(complex128)
|
||||
SHLQ $4, R8 // Incrementors*16 for easy iteration (ADDQ)
|
||||
MOVQ incY+120(FP), R9
|
||||
SHLQ $4, R9
|
||||
MOVQ incDst+24(FP), R10
|
||||
SHLQ $4, R10
|
||||
MOVUPS alpha+40(FP), X0 // (ar,ai)
|
||||
MOVUPS alpha+40(FP), X0 // X0 := { imag(a), real(a) }
|
||||
MOVAPS X0, X1
|
||||
SHUFPD $0x1, X1, X1 // (ai,ar)
|
||||
MOVAPS X0, X10
|
||||
SHUFPD $0x1, X1, X1 // X1 := { real(a), imag(a) }
|
||||
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
|
||||
MOVAPS X1, X11
|
||||
MOVQ CX, BX
|
||||
ANDQ $3, CX
|
||||
SHRQ $2, BX
|
||||
JZ axpyi_tail
|
||||
ANDQ $3, CX // CX = floor( CX / 4 )
|
||||
SHRQ $2, BX // BX = CX % 4
|
||||
JZ axpyi_tail // if BX == 0 { goto caxy_tail }
|
||||
|
||||
axpyi_loop:
|
||||
MOVUPS (SI), X2
|
||||
axpyi_loop: // do {
|
||||
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
|
||||
MOVUPS (SI)(R8*1), X4
|
||||
LEAQ (SI)(R8*2), SI
|
||||
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
|
||||
|
||||
MOVUPS (SI), X6
|
||||
MOVUPS (SI)(R8*1), X8
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
|
||||
// X_(i+1) = { real(x[i], real(x[i]) }
|
||||
MOVDDUP_X2_X3
|
||||
MOVDDUP_X4_X5
|
||||
SHUFPD $0x3, X4, X4
|
||||
MOVDDUP_X6_X7
|
||||
SHUFPD $0x3, X6, X6
|
||||
MOVDDUP_X8_X9
|
||||
|
||||
// X_i = { imag(x[i]), imag(x[i]) }
|
||||
SHUFPD $0x3, X2, X2
|
||||
SHUFPD $0x3, X4, X4
|
||||
SHUFPD $0x3, X6, X6
|
||||
SHUFPD $0x3, X8, X8
|
||||
MULPD X1, X2 // (ai*xr, ar*xr)
|
||||
MULPD X0, X3 // (ar*xi, ai*xi)
|
||||
MULPD X11, X4
|
||||
MULPD X10, X5
|
||||
MULPD X1, X6
|
||||
MULPD X0, X7
|
||||
MULPD X11, X8
|
||||
MULPD X10, X9
|
||||
ADDSUBPD_X2_X3 // Add/Sub to (ai*xr + ar*xi , ar*xr - (ai*xi))
|
||||
|
||||
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
|
||||
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
|
||||
MULPD X1, X2
|
||||
MULPD X0, X3
|
||||
MULPD X11, X4
|
||||
MULPD X10, X5
|
||||
MULPD X1, X6
|
||||
MULPD X0, X7
|
||||
MULPD X11, X8
|
||||
MULPD X10, X9
|
||||
|
||||
// X_(i+1) = {
|
||||
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
|
||||
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
|
||||
// }
|
||||
ADDSUBPD_X2_X3
|
||||
ADDSUBPD_X4_X5
|
||||
ADDSUBPD_X6_X7
|
||||
ADDSUBPD_X8_X9
|
||||
|
||||
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
|
||||
ADDPD (DX), X3
|
||||
ADDPD (DX)(R9*1), X5
|
||||
LEAQ (DX)(R9*2), DX
|
||||
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
|
||||
ADDPD (DX), X7
|
||||
ADDPD (DX)(R9*1), X9
|
||||
MOVUPS X3, (DI) // Write result back to dst
|
||||
MOVUPS X3, (DI) // dst[i] = X_(i+1)
|
||||
MOVUPS X5, (DI)(R10*1)
|
||||
LEAQ (DI)(R10*2), DI
|
||||
MOVUPS X7, (DI)
|
||||
MOVUPS X9, (DI)(R10*1)
|
||||
LEAQ (SI)(R8*2), SI // Increment addresses
|
||||
LEAQ (DX)(R9*2), DX
|
||||
LEAQ (DI)(R10*2), DI
|
||||
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
|
||||
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
|
||||
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
|
||||
DECQ BX
|
||||
JNZ axpyi_loop
|
||||
CMPQ CX, $0
|
||||
JNZ axpyi_loop // } while --BX > 0
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE axpyi_end
|
||||
|
||||
axpyi_tail:
|
||||
MOVUPS (SI), X2
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
MULPD X1, X2 // (ai*x2r, ar*x2r, ai*x1r, ar*x1r)
|
||||
MULPD X0, X3 // (ar*x2i, ai*x2i, ar*x1i, ai*x1i)
|
||||
ADDSUBPD_X2_X3 // (ai*x2r+ar*x2i, ar*x2r-ai*x2i, ai*x1r+ar*x1i, ar*x1r-ai*x1i)
|
||||
axpyi_tail: // do {
|
||||
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
|
||||
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) }
|
||||
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
|
||||
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
|
||||
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
|
||||
|
||||
// X_(i+1) = {
|
||||
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
|
||||
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
|
||||
// }
|
||||
ADDSUBPD_X2_X3
|
||||
|
||||
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
|
||||
ADDPD (DX), X3
|
||||
MOVUPS X3, (DI)
|
||||
ADDQ R8, SI // Increment addresses
|
||||
ADDQ R9, DX
|
||||
ADDQ R10, DI
|
||||
LOOP axpyi_tail
|
||||
MOVUPS X3, (DI) // y[i] X_(i+1)
|
||||
ADDQ R8, SI // SI += incX
|
||||
ADDQ R9, DX // DX += incY
|
||||
ADDQ R10, DI // DI += incDst
|
||||
LOOP axpyi_tail // } while --CX > 0
|
||||
|
||||
axpyi_end:
|
||||
RET
|
||||
|
@@ -26,76 +26,97 @@
|
||||
|
||||
// func AxpyUnitary(alpha complex128, x, y []complex128)
|
||||
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
|
||||
MOVQ x_base+16(FP), SI
|
||||
MOVQ y_base+40(FP), DI
|
||||
MOVQ x_len+24(FP), CX
|
||||
MOVQ x_base+16(FP), SI // SI := &x
|
||||
MOVQ y_base+40(FP), DI // DI := &y
|
||||
MOVQ x_len+24(FP), CX // CX := min( len(x), len(y) )
|
||||
CMPQ y_len+48(FP), CX
|
||||
CMOVQLE y_len+48(FP), CX
|
||||
CMPQ CX, $0
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE caxy_end
|
||||
PXOR X0, X0 // Clear work registers and cache-align loop
|
||||
PXOR X1, X1
|
||||
MOVUPS alpha+0(FP), X0 // (ar,ai)
|
||||
MOVUPS alpha+0(FP), X0 // X0 := { imag(a), real(a) }
|
||||
MOVAPS X0, X1
|
||||
SHUFPD $0x1, X1, X1 // (ai,ar)
|
||||
XORQ AX, AX
|
||||
MOVAPS X0, X10
|
||||
SHUFPD $0x1, X1, X1 // X1 := { real(a), imag(a) }
|
||||
XORQ AX, AX // i := 0
|
||||
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
|
||||
MOVAPS X1, X11
|
||||
MOVQ CX, BX
|
||||
ANDQ $3, CX
|
||||
SHRQ $2, BX
|
||||
JZ caxy_tail
|
||||
ANDQ $3, CX // CX = floor( CX / 4 )
|
||||
SHRQ $2, BX // BX = CX % 4
|
||||
JZ caxy_tail // if BX == 0 { goto caxy_tail }
|
||||
|
||||
caxy_loop:
|
||||
MOVUPS (SI)(AX*8), X2
|
||||
caxy_loop: // do {
|
||||
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
|
||||
MOVUPS 16(SI)(AX*8), X4
|
||||
MOVUPS 32(SI)(AX*8), X6
|
||||
MOVUPS 48(SI)(AX*8), X8
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
|
||||
// X_(i+1) = { real(x[i], real(x[i]) }
|
||||
MOVDDUP_X2_X3
|
||||
MOVDDUP_X4_X5
|
||||
SHUFPD $0x3, X4, X4
|
||||
MOVDDUP_X6_X7
|
||||
SHUFPD $0x3, X6, X6
|
||||
MOVDDUP_X8_X9
|
||||
|
||||
// X_i = { imag(x[i]), imag(x[i]) }
|
||||
SHUFPD $0x3, X2, X2
|
||||
SHUFPD $0x3, X4, X4
|
||||
SHUFPD $0x3, X6, X6
|
||||
SHUFPD $0x3, X8, X8
|
||||
MULPD X1, X2 // (ai*xr, ar*xr)
|
||||
MULPD X0, X3 // (ar*xi, ai*xi)
|
||||
MULPD X11, X4
|
||||
MULPD X10, X5
|
||||
MULPD X1, X6
|
||||
MULPD X0, X7
|
||||
MULPD X11, X8
|
||||
MULPD X10, X9
|
||||
ADDSUBPD_X2_X3 // Add/Sub to (ai*xr + ar*xi , ar*xr - (ai*xi))
|
||||
|
||||
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
|
||||
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
|
||||
MULPD X1, X2
|
||||
MULPD X0, X3
|
||||
MULPD X11, X4
|
||||
MULPD X10, X5
|
||||
MULPD X1, X6
|
||||
MULPD X0, X7
|
||||
MULPD X11, X8
|
||||
MULPD X10, X9
|
||||
|
||||
// X_(i+1) = {
|
||||
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
|
||||
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
|
||||
// }
|
||||
ADDSUBPD_X2_X3
|
||||
ADDSUBPD_X4_X5
|
||||
ADDSUBPD_X6_X7
|
||||
ADDSUBPD_X8_X9
|
||||
ADDPD (DI)(AX*8), X3 // Add y2,y1 to a*(x2,x1)
|
||||
|
||||
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
|
||||
ADDPD (DI)(AX*8), X3
|
||||
ADDPD 16(DI)(AX*8), X5
|
||||
ADDPD 32(DI)(AX*8), X7
|
||||
ADDPD 48(DI)(AX*8), X9
|
||||
MOVUPS X3, (DI)(AX*8) // Write result back to y2,y1
|
||||
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
|
||||
MOVUPS X5, 16(DI)(AX*8)
|
||||
MOVUPS X7, 32(DI)(AX*8)
|
||||
MOVUPS X9, 48(DI)(AX*8)
|
||||
ADDQ $8, AX
|
||||
ADDQ $8, AX // i += 8
|
||||
DECQ BX
|
||||
JNZ caxy_loop
|
||||
CMPQ CX, $0
|
||||
JNZ caxy_loop // } while --BX > 0
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE caxy_end
|
||||
|
||||
caxy_tail:
|
||||
MOVUPS (SI)(AX*8), X2
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
MULPD X1, X2 // (ai*x2r, ar*x2r, ai*x1r, ar*x1r)
|
||||
MULPD X0, X3 // (ar*x2i, ai*x2i, ar*x1i, ai*x1i)
|
||||
ADDSUBPD_X2_X3 // (ai*x2r+ar*x2i, ar*x2r-ai*x2i, ai*x1r+ar*x1i, ar*x1r-ai*x1i)
|
||||
ADDPD (DI)(AX*8), X3 // */
|
||||
MOVUPS X3, (DI)(AX*8)
|
||||
ADDQ $2, AX
|
||||
LOOP caxy_tail
|
||||
caxy_tail: // do {
|
||||
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
|
||||
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) }
|
||||
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
|
||||
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
|
||||
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
|
||||
|
||||
// X_(i+1) = {
|
||||
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
|
||||
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
|
||||
// }
|
||||
ADDSUBPD_X2_X3
|
||||
|
||||
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
|
||||
ADDPD (DI)(AX*8), X3
|
||||
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
|
||||
ADDQ $2, AX // i += 2
|
||||
LOOP caxy_tail // } while --CX > 0
|
||||
|
||||
caxy_end:
|
||||
RET
|
||||
|
@@ -26,77 +26,98 @@
|
||||
|
||||
// func AxpyUnitaryTo(dst []complex128, alpha complex64, x, y []complex128)
|
||||
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ x_base+40(FP), SI
|
||||
MOVQ y_base+64(FP), DX
|
||||
MOVQ x_len+48(FP), CX
|
||||
MOVQ dst_base+0(FP), DI // DI := &dst
|
||||
MOVQ x_base+40(FP), SI // SI := &x
|
||||
MOVQ y_base+64(FP), DX // DX := &y
|
||||
MOVQ x_len+48(FP), CX // CX := min( len(x), len(y), len(dst) )
|
||||
CMPQ y_len+72(FP), CX
|
||||
CMOVQLE y_len+72(FP), CX
|
||||
CMPQ dst_len+8(FP), CX
|
||||
CMOVQLE dst_len+8(FP), CX
|
||||
CMPQ CX, $0
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE caxy_end
|
||||
MOVUPS alpha+24(FP), X0 // (ar,ai)
|
||||
MOVUPS alpha+24(FP), X0 // X0 := { imag(a), real(a) }
|
||||
MOVAPS X0, X1
|
||||
SHUFPD $0x1, X1, X1 // (ai,ar)
|
||||
XORQ AX, AX
|
||||
MOVAPS X0, X10
|
||||
SHUFPD $0x1, X1, X1 // X1 := { real(a), imag(a) }
|
||||
XORQ AX, AX // i := 0
|
||||
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
|
||||
MOVAPS X1, X11
|
||||
MOVQ CX, BX
|
||||
ANDQ $3, CX
|
||||
SHRQ $2, BX
|
||||
JZ caxy_tail
|
||||
ANDQ $3, CX // CX = floor( CX / 4 )
|
||||
SHRQ $2, BX // BX = CX % 4
|
||||
JZ caxy_tail // if BX == 0 { goto caxy_tail }
|
||||
|
||||
caxy_loop:
|
||||
MOVUPS (SI)(AX*8), X2
|
||||
caxy_loop: // do {
|
||||
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
|
||||
MOVUPS 16(SI)(AX*8), X4
|
||||
MOVUPS 32(SI)(AX*8), X6
|
||||
MOVUPS 48(SI)(AX*8), X8
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
|
||||
// X_(i+1) = { real(x[i], real(x[i]) }
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
MOVDDUP_X4_X5
|
||||
SHUFPD $0x3, X4, X4
|
||||
MOVDDUP_X6_X7
|
||||
SHUFPD $0x3, X6, X6
|
||||
MOVDDUP_X8_X9
|
||||
|
||||
// X_i = { imag(x[i]), imag(x[i]) }
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
SHUFPD $0x3, X4, X4
|
||||
SHUFPD $0x3, X6, X6
|
||||
SHUFPD $0x3, X8, X8
|
||||
MULPD X1, X2 // (ai*xr, ar*xr)
|
||||
MULPD X0, X3 // (ar*xi, ai*xi)
|
||||
MULPD X11, X4
|
||||
MULPD X10, X5
|
||||
MULPD X1, X6
|
||||
MULPD X0, X7
|
||||
MULPD X11, X8
|
||||
MULPD X10, X9
|
||||
ADDSUBPD_X2_X3 // Add/Sub to (ai*xr + ar*xi , ar*xr - (ai*xi))
|
||||
|
||||
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
|
||||
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
|
||||
MULPD X1, X2
|
||||
MULPD X0, X3
|
||||
MULPD X11, X4
|
||||
MULPD X10, X5
|
||||
MULPD X1, X6
|
||||
MULPD X0, X7
|
||||
MULPD X11, X8
|
||||
MULPD X10, X9
|
||||
|
||||
// X_(i+1) = {
|
||||
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
|
||||
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
|
||||
// }
|
||||
ADDSUBPD_X2_X3
|
||||
ADDSUBPD_X4_X5
|
||||
ADDSUBPD_X6_X7
|
||||
ADDSUBPD_X8_X9
|
||||
ADDPD (DX)(AX*8), X3 // Add y2,y1 to a*(x2,x1)
|
||||
|
||||
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
|
||||
ADDPD (DX)(AX*8), X3
|
||||
ADDPD 16(DX)(AX*8), X5
|
||||
ADDPD 32(DX)(AX*8), X7
|
||||
ADDPD 48(DX)(AX*8), X9
|
||||
MOVUPS X3, (DI)(AX*8) // Write result back to y2,y1
|
||||
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
|
||||
MOVUPS X5, 16(DI)(AX*8)
|
||||
MOVUPS X7, 32(DI)(AX*8)
|
||||
MOVUPS X9, 48(DI)(AX*8)
|
||||
ADDQ $8, AX
|
||||
ADDQ $8, AX // i += 8
|
||||
DECQ BX
|
||||
JNZ caxy_loop
|
||||
CMPQ CX, $0
|
||||
JNZ caxy_loop // } while --BX > 0
|
||||
CMPQ CX, $0 // if CX == 0 { return }
|
||||
JE caxy_end
|
||||
|
||||
caxy_tail: // Same calculation, but read in values to avoid trampling memory
|
||||
MOVUPS (SI)(AX*8), X2
|
||||
MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
|
||||
SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
|
||||
MULPD X1, X2 // (ai*x2r, ar*x2r, ai*x1r, ar*x1r)
|
||||
MULPD X0, X3 // (ar*x2i, ai*x2i, ar*x1i, ai*x1i)
|
||||
ADDSUBPD_X2_X3 // (ai*x2r+ar*x2i, ar*x2r-ai*x2i, ai*x1r+ar*x1i, ar*x1r-ai*x1i)
|
||||
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
|
||||
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) }
|
||||
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
|
||||
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
|
||||
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
|
||||
|
||||
// X_(i+1) = {
|
||||
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
|
||||
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
|
||||
// }
|
||||
ADDSUBPD_X2_X3
|
||||
|
||||
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
|
||||
ADDPD (DX)(AX*8), X3
|
||||
MOVUPS X3, (DI)(AX*8)
|
||||
ADDQ $2, AX
|
||||
LOOP caxy_tail
|
||||
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
|
||||
ADDQ $2, AX // i += 2
|
||||
LOOP caxy_tail // } while --CX > 0
|
||||
|
||||
caxy_end:
|
||||
RET
|
||||
|
Reference in New Issue
Block a user