asm/c128: Commenting asm code

This commit is contained in:
Chad Kunde
2016-06-20 12:23:19 -07:00
parent 022e159e89
commit c5df796543
4 changed files with 267 additions and 182 deletions

View File

@@ -26,85 +26,106 @@
// func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
// AxpyInc computes y[iy + i*incY] += alpha * x[ix + i*incX] for i in [0, n).
// axpyi_loop handles four elements per iteration; axpyi_tail handles the
// remaining n % 4 elements one at a time.
// Register contents in the comments are written { high qword, low qword }.
TEXT ·AxpyInc(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SI
MOVQ y_base+40(FP), DI
MOVQ n+64(FP), CX
CMPQ CX, $0 // if n==0, return
MOVQ x_base+16(FP), SI // SI := &x
MOVQ y_base+40(FP), DI // DI := &y
MOVQ n+64(FP), CX // CX := n
CMPQ CX, $0 // if n==0 { return }
JE axpyi_end
MOVQ ix+88(FP), R8 // Load the first indices
MOVQ ix+88(FP), R8 // Load the first index
SHLQ $1, R8 // Double to adjust for 16-byte size
MOVQ iy+96(FP), R9
SHLQ $1, R9
LEAQ (SI)(R8*8), SI // Calculate address of first indices
LEAQ (DI)(R9*8), DI
LEAQ (SI)(R8*8), SI // SI = &(x[ix])
LEAQ (DI)(R9*8), DI // DI = &(y[iy])
MOVQ incX+72(FP), R8 // Incrementors*16 for easy iteration (ADDQ)
SHLQ $4, R8
MOVQ incY+80(FP), R9
SHLQ $4, R9
MOVUPS alpha+0(FP), X0 // { ai, ar }
MOVUPS alpha+0(FP), X0 // X0 := { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // { ar, ai }
MOVAPS X0, X10
SHUFPD $0x1, X1, X1 // X1 := { real(a), imag(a) }
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX
SHRQ $2, BX
JZ axpyi_tail
ANDQ $3, CX // CX = n % 4 (elements left for the tail loop)
SHRQ $2, BX // BX = floor( n / 4 ) (unrolled iterations)
JZ axpyi_tail // if BX == 0 { goto axpyi_tail }
axpyi_loop:
MOVUPS (SI), X2
axpyi_loop: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS (SI)(R8*1), X4
LEAQ (SI)(R8*2), SI
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVUPS (SI), X6
MOVUPS (SI)(R8*1), X8
MOVDDUP_X2_X3 // Load and duplicate real elements { xr, xr }
SHUFPD $0x3, X2, X2 // duplicate imag elements { xi, xi }
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
SHUFPD $0x3, X4, X4
MOVDDUP_X6_X7
SHUFPD $0x3, X6, X6
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
MULPD X1, X2 // { ar*xi, ai*xi }
MULPD X0, X3 // { ai*xr, ar*xr }
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
ADDSUBPD_X2_X3 // Add/Sub to { ai*xr + ar*xi, ar*xr - ai*xi }
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI), X3
ADDPD (DI)(R9*1), X5
MOVUPS X3, (DI) // Write result back to y
MOVUPS X3, (DI) // y[i] = X_(i+1)
MOVUPS X5, (DI)(R9*1)
LEAQ (DI)(R9*2), DI
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
ADDPD (DI), X7
ADDPD (DI)(R9*1), X9
MOVUPS X7, (DI) // Write result back to y
MOVUPS X7, (DI)
MOVUPS X9, (DI)(R9*1)
LEAQ (SI)(R8*2), SI // Increment addresses
LEAQ (DI)(R9*2), DI
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
DECQ BX
JNZ axpyi_loop
CMPQ CX, $0
JNZ axpyi_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE axpyi_end
axpyi_tail:
MOVUPS (SI), X2
MOVDDUP_X2_X3 // Load and duplicate real elements { xr, xr }
SHUFPD $0x3, X2, X2 // duplicate imag elements { xi, xi }
MULPD X1, X2 // { ar*xi, ai*xi }
MULPD X0, X3 // { ai*xr, ar*xr }
ADDSUBPD_X2_X3 // { ai*xr + ar*xi, ar*xr - ai*xi }
axpyi_tail: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI), X3
MOVUPS X3, (DI)
ADDQ R8, SI // Increment addresses
ADDQ R9, DI
LOOP axpyi_tail
MOVUPS X3, (DI) // y[i] = X_(i+1)
ADDQ R8, SI // SI = &(SI[incX])
ADDQ R9, DI // DI = &(DI[incY])
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -26,94 +26,116 @@
// func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
// AxpyIncTo computes dst[idst + i*incDst] = alpha * x[ix + i*incX] + y[iy + i*incY]
// for i in [0, n). axpyi_loop handles four elements per iteration;
// axpyi_tail handles the remaining n % 4 elements one at a time.
// Register contents in the comments are written { high qword, low qword }.
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI
MOVQ x_base+56(FP), SI
MOVQ y_base+80(FP), DX
MOVQ n+104(FP), CX
CMPQ CX, $0 // if n==0, return
MOVQ dst_base+0(FP), DI // DI := &dst
MOVQ x_base+56(FP), SI // SI := &x
MOVQ y_base+80(FP), DX // DX := &y
MOVQ n+104(FP), CX // CX := n
CMPQ CX, $0 // if n==0 { return }
JE axpyi_end
MOVQ ix+128(FP), R8 // Load the first indices
MOVQ ix+128(FP), R8 // Load the first index
SHLQ $1, R8 // Double to adjust for 16-byte size
MOVQ iy+136(FP), R9
SHLQ $1, R9
MOVQ idst+32(FP), R10
SHLQ $1, R10
LEAQ (SI)(R8*8), SI // Calculate address of first indices
LEAQ (DX)(R9*8), DX
LEAQ (DI)(R10*8), DI
MOVQ incX+112(FP), R8 // Incrementors*16 for easy iteration (ADDQ)
SHLQ $4, R8
LEAQ (SI)(R8*8), SI // SI = &(x[ix])
LEAQ (DX)(R9*8), DX // DX = &(y[iy])
LEAQ (DI)(R10*8), DI // DI = &(dst[idst])
MOVQ incX+112(FP), R8 // R8 = incX * sizeof(complex128)
SHLQ $4, R8 // Incrementors*16 for easy iteration (ADDQ)
MOVQ incY+120(FP), R9
SHLQ $4, R9
MOVQ incDst+24(FP), R10
SHLQ $4, R10
MOVUPS alpha+40(FP), X0 // { ai, ar }
MOVUPS alpha+40(FP), X0 // X0 := { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // { ar, ai }
MOVAPS X0, X10
SHUFPD $0x1, X1, X1 // X1 := { real(a), imag(a) }
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX
SHRQ $2, BX
JZ axpyi_tail
ANDQ $3, CX // CX = n % 4 (elements left for the tail loop)
SHRQ $2, BX // BX = floor( n / 4 ) (unrolled iterations)
JZ axpyi_tail // if BX == 0 { goto axpyi_tail }
axpyi_loop:
MOVUPS (SI), X2
axpyi_loop: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS (SI)(R8*1), X4
LEAQ (SI)(R8*2), SI
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVUPS (SI), X6
MOVUPS (SI)(R8*1), X8
MOVDDUP_X2_X3 // Load and duplicate real elements { xr, xr }
SHUFPD $0x3, X2, X2 // duplicate imag elements { xi, xi }
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
SHUFPD $0x3, X4, X4
MOVDDUP_X6_X7
SHUFPD $0x3, X6, X6
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
MULPD X1, X2 // { ar*xi, ai*xi }
MULPD X0, X3 // { ai*xr, ar*xr }
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
ADDSUBPD_X2_X3 // Add/Sub to { ai*xr + ar*xi, ar*xr - ai*xi }
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX), X3
ADDPD (DX)(R9*1), X5
LEAQ (DX)(R9*2), DX
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDPD (DX), X7
ADDPD (DX)(R9*1), X9
MOVUPS X3, (DI) // Write result back to dst
MOVUPS X3, (DI) // dst[i] = X_(i+1)
MOVUPS X5, (DI)(R10*1)
LEAQ (DI)(R10*2), DI
MOVUPS X7, (DI)
MOVUPS X9, (DI)(R10*1)
LEAQ (SI)(R8*2), SI // Increment addresses
LEAQ (DX)(R9*2), DX
LEAQ (DI)(R10*2), DI
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
DECQ BX
JNZ axpyi_loop
CMPQ CX, $0
JNZ axpyi_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE axpyi_end
axpyi_tail:
MOVUPS (SI), X2
MOVDDUP_X2_X3 // Load and duplicate real elements { xr, xr }
SHUFPD $0x3, X2, X2 // duplicate imag elements { xi, xi }
MULPD X1, X2 // { ar*xi, ai*xi }
MULPD X0, X3 // { ai*xr, ar*xr }
ADDSUBPD_X2_X3 // { ai*xr + ar*xi, ar*xr - ai*xi }
axpyi_tail: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX), X3
MOVUPS X3, (DI)
ADDQ R8, SI // Increment addresses
ADDQ R9, DX
ADDQ R10, DI
LOOP axpyi_tail
MOVUPS X3, (DI) // dst[i] = X_(i+1)
ADDQ R8, SI // SI += incX
ADDQ R9, DX // DX += incY
ADDQ R10, DI // DI += incDst
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -26,76 +26,97 @@
// func AxpyUnitary(alpha complex128, x, y []complex128)
// AxpyUnitary computes y[i] += alpha * x[i] for i in [0, min(len(x), len(y))).
// caxy_loop handles four elements per iteration; caxy_tail handles the
// remaining count % 4 elements one at a time.
// AX indexes both slices in 8-byte words (see the (AX*8) addressing), so one
// complex128 advances AX by 2.
// Register contents in the comments are written { high qword, low qword }.
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SI
MOVQ y_base+40(FP), DI
MOVQ x_len+24(FP), CX
MOVQ x_base+16(FP), SI // SI := &x
MOVQ y_base+40(FP), DI // DI := &y
MOVQ x_len+24(FP), CX // CX := min( len(x), len(y) )
CMPQ y_len+48(FP), CX
CMOVQLE y_len+48(FP), CX
CMPQ CX, $0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
PXOR X0, X0 // Clear work registers and cache-align loop
PXOR X1, X1
MOVUPS alpha+0(FP), X0 // { ai, ar }
MOVUPS alpha+0(FP), X0 // X0 := { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // { ar, ai }
XORQ AX, AX
MOVAPS X0, X10
SHUFPD $0x1, X1, X1 // X1 := { real(a), imag(a) }
XORQ AX, AX // i := 0
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX
SHRQ $2, BX
JZ caxy_tail
ANDQ $3, CX // CX = count % 4 (elements left for the tail loop)
SHRQ $2, BX // BX = floor( count / 4 ) (unrolled iterations)
JZ caxy_tail // if BX == 0 { goto caxy_tail }
caxy_loop:
MOVUPS (SI)(AX*8), X2
caxy_loop: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SI)(AX*8), X4
MOVUPS 32(SI)(AX*8), X6
MOVUPS 48(SI)(AX*8), X8
MOVDDUP_X2_X3 // Load and duplicate real elements { xr, xr }
SHUFPD $0x3, X2, X2 // duplicate imag elements { xi, xi }
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
SHUFPD $0x3, X4, X4
MOVDDUP_X6_X7
SHUFPD $0x3, X6, X6
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
MULPD X1, X2 // { ar*xi, ai*xi }
MULPD X0, X3 // { ai*xr, ar*xr }
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
ADDSUBPD_X2_X3 // Add/Sub to { ai*xr + ar*xi, ar*xr - ai*xi }
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
ADDPD (DI)(AX*8), X3 // Add y[i] to a*x[i]
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI)(AX*8), X3
ADDPD 16(DI)(AX*8), X5
ADDPD 32(DI)(AX*8), X7
ADDPD 48(DI)(AX*8), X9
MOVUPS X3, (DI)(AX*8) // Write result back to y[i]
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
MOVUPS X5, 16(DI)(AX*8)
MOVUPS X7, 32(DI)(AX*8)
MOVUPS X9, 48(DI)(AX*8)
ADDQ $8, AX
ADDQ $8, AX // i += 8 (8 words of 8 bytes = the 4 complex128 values just processed)
DECQ BX
JNZ caxy_loop
CMPQ CX, $0
JNZ caxy_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
caxy_tail:
MOVUPS (SI)(AX*8), X2
MOVDDUP_X2_X3 // Load and duplicate real elements { xr, xr }
SHUFPD $0x3, X2, X2 // duplicate imag elements { xi, xi }
MULPD X1, X2 // { ar*xi, ai*xi }
MULPD X0, X3 // { ai*xr, ar*xr }
ADDSUBPD_X2_X3 // { ai*xr + ar*xi, ar*xr - ai*xi }
ADDPD (DI)(AX*8), X3 // Add y[i] to a*x[i]
MOVUPS X3, (DI)(AX*8)
ADDQ $2, AX
LOOP caxy_tail
caxy_tail: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI)(AX*8), X3
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
ADDQ $2, AX // i += 2 (2 words of 8 bytes = one complex128)
LOOP caxy_tail // } while --CX > 0
caxy_end:
RET

View File

@@ -26,77 +26,98 @@
// func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128)
// AxpyUnitaryTo computes dst[i] = alpha * x[i] + y[i] for i in
// [0, min(len(x), len(y), len(dst))). caxy_loop handles four elements per
// iteration; caxy_tail handles the remaining count % 4 elements one at a time.
// alpha occupies 16 bytes of the frame (alpha+24 .. x_base+40), i.e. a complex128.
// AX indexes the slices in 8-byte words (see the (AX*8) addressing), so one
// complex128 advances AX by 2.
// Register contents in the comments are written { high qword, low qword }.
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI
MOVQ x_base+40(FP), SI
MOVQ y_base+64(FP), DX
MOVQ x_len+48(FP), CX
MOVQ dst_base+0(FP), DI // DI := &dst
MOVQ x_base+40(FP), SI // SI := &x
MOVQ y_base+64(FP), DX // DX := &y
MOVQ x_len+48(FP), CX // CX := min( len(x), len(y), len(dst) )
CMPQ y_len+72(FP), CX
CMOVQLE y_len+72(FP), CX
CMPQ dst_len+8(FP), CX
CMOVQLE dst_len+8(FP), CX
CMPQ CX, $0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
MOVUPS alpha+24(FP), X0 // { ai, ar }
MOVUPS alpha+24(FP), X0 // X0 := { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // { ar, ai }
XORQ AX, AX
MOVAPS X0, X10
SHUFPD $0x1, X1, X1 // X1 := { real(a), imag(a) }
XORQ AX, AX // i := 0
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX
SHRQ $2, BX
JZ caxy_tail
ANDQ $3, CX // CX = count % 4 (elements left for the tail loop)
SHRQ $2, BX // BX = floor( count / 4 ) (unrolled iterations)
JZ caxy_tail // if BX == 0 { goto caxy_tail }
caxy_loop:
MOVUPS (SI)(AX*8), X2
caxy_loop: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SI)(AX*8), X4
MOVUPS 32(SI)(AX*8), X6
MOVUPS 48(SI)(AX*8), X8
MOVDDUP_X2_X3 // Load and duplicate real elements { xr, xr }
SHUFPD $0x3, X2, X2 // duplicate imag elements { xi, xi }
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // Duplicate real elements { xr, xr }
MOVDDUP_X4_X5
SHUFPD $0x3, X4, X4
MOVDDUP_X6_X7
SHUFPD $0x3, X6, X6
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2 // Duplicate imag elements { xi, xi }
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
MULPD X1, X2 // { ar*xi, ai*xi }
MULPD X0, X3 // { ai*xr, ar*xr }
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
ADDSUBPD_X2_X3 // Add/Sub to { ai*xr + ar*xi, ar*xr - ai*xi }
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
ADDPD (DX)(AX*8), X3 // Add y[i] to a*x[i]
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX)(AX*8), X3
ADDPD 16(DX)(AX*8), X5
ADDPD 32(DX)(AX*8), X7
ADDPD 48(DX)(AX*8), X9
MOVUPS X3, (DI)(AX*8) // Write result to dst[i]
MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1)
MOVUPS X5, 16(DI)(AX*8)
MOVUPS X7, 32(DI)(AX*8)
MOVUPS X9, 48(DI)(AX*8)
ADDQ $8, AX
ADDQ $8, AX // i += 8 (8 words of 8 bytes = the 4 complex128 values just processed)
DECQ BX
JNZ caxy_loop
CMPQ CX, $0
JNZ caxy_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
caxy_tail: // Same calculation, but read in values to avoid trampling memory
MOVUPS (SI)(AX*8), X2
MOVDDUP_X2_X3 // Load and duplicate real elements { xr, xr }
SHUFPD $0x3, X2, X2 // duplicate imag elements { xi, xi }
MULPD X1, X2 // { ar*xi, ai*xi }
MULPD X0, X3 // { ai*xr, ar*xr }
ADDSUBPD_X2_X3 // { ai*xr + ar*xi, ar*xr - ai*xi }
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX)(AX*8), X3
MOVUPS X3, (DI)(AX*8)
ADDQ $2, AX
LOOP caxy_tail
MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1)
ADDQ $2, AX // i += 2 (2 words of 8 bytes = one complex128)
LOOP caxy_tail // } while --CX > 0
caxy_end:
RET