// Copyright ©2017 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//+build !noasm,!appengine,!safe

#include "textflag.h"

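// GemvT computes
//	y = alpha * Aᵀ * x + beta * y
// for a dense m×n matrix A and strided vectors x and y. A fast path
// handles unit-stride y; the inc path at the bottom of the file
// handles arbitrary (including negative) strides using split loads
// and stores.
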
#define SIZE 8 // sizeof(float64)

// The kernel accumulates rows of A scaled by elements of x into y,
// so its inner dimension M is the caller's n (the length of y) and
// its outer dimension N is the caller's m (the length of x).
#define M_DIM n+8(FP)
#define M CX
#define N_DIM m+0(FP)
#define N BX

#define TMP1 R14
#define TMP2 R15

#define X_PTR SI
#define X x_base+56(FP)
#define Y_PTR DX
#define Y y_base+96(FP)
#define A_ROW AX
#define A_PTR DI

#define INC_X R8
#define INC3_X R9

#define INC_Y R10
#define INC3_Y R11

#define LDA R12
#define LDA3 R13

#define ALPHA X15
#define BETA X14

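// INIT{4,2,1} broadcast the next 4, 2, or 1 elements of x into
// X8-X11 and pre-multiply them by alpha, so each kernel pass only
// has to multiply by matrix elements and accumulate.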
#define INIT4 \
	MOVDDUP (X_PTR), X8            \
	MOVDDUP (X_PTR)(INC_X*1), X9   \
	MOVDDUP (X_PTR)(INC_X*2), X10  \
	MOVDDUP (X_PTR)(INC3_X*1), X11 \
	MULPD   ALPHA, X8              \
	MULPD   ALPHA, X9              \
	MULPD   ALPHA, X10             \
	MULPD   ALPHA, X11

#define INIT2 \
	MOVDDUP (X_PTR), X8          \
	MOVDDUP (X_PTR)(INC_X*1), X9 \
	MULPD   ALPHA, X8            \
	MULPD   ALPHA, X9

#define INIT1 \
	MOVDDUP (X_PTR), X8 \
	MULPD   ALPHA, X8

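// KERNEL_LOAD{4,2} load 4 or 2 contiguous elements of y into X0/X1;
// the _INC variants gather the same elements from a strided y with
// paired MOVSD/MOVHPD loads.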
#define KERNEL_LOAD4 \
	MOVUPS (Y_PTR), X0       \
	MOVUPS 2*SIZE(Y_PTR), X1

#define KERNEL_LOAD2 \
	MOVUPS (Y_PTR), X0

#define KERNEL_LOAD4_INC \
	MOVSD  (Y_PTR), X0          \
	MOVHPD (Y_PTR)(INC_Y*1), X0 \
	MOVSD  (Y_PTR)(INC_Y*2), X1 \
	MOVHPD (Y_PTR)(INC3_Y*1), X1

#define KERNEL_LOAD2_INC \
	MOVSD  (Y_PTR), X0          \
	MOVHPD (Y_PTR)(INC_Y*1), X0

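// KERNEL_4x4 updates four elements of y (held in X0/X1) against four
// rows of A at A_PTR, A_PTR+LDA, A_PTR+2*LDA and A_PTR+3*LDA, each
// scaled by the matching alpha*x value in X8-X11. KERNEL_4x2 and
// KERNEL_4x1 do the same against two rows and one row.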
#define KERNEL_4x4 \
	MOVUPS (A_PTR), X4               \
	MOVUPS 2*SIZE(A_PTR), X5         \
	MOVUPS (A_PTR)(LDA*1), X6        \
	MOVUPS 2*SIZE(A_PTR)(LDA*1), X7  \
	MULPD  X8, X4                    \
	MULPD  X8, X5                    \
	MULPD  X9, X6                    \
	MULPD  X9, X7                    \
	ADDPD  X4, X0                    \
	ADDPD  X5, X1                    \
	ADDPD  X6, X0                    \
	ADDPD  X7, X1                    \
	MOVUPS (A_PTR)(LDA*2), X4        \
	MOVUPS 2*SIZE(A_PTR)(LDA*2), X5  \
	MOVUPS (A_PTR)(LDA3*1), X6       \
	MOVUPS 2*SIZE(A_PTR)(LDA3*1), X7 \
	MULPD  X10, X4                   \
	MULPD  X10, X5                   \
	MULPD  X11, X6                   \
	MULPD  X11, X7                   \
	ADDPD  X4, X0                    \
	ADDPD  X5, X1                    \
	ADDPD  X6, X0                    \
	ADDPD  X7, X1                    \
	ADDQ   $4*SIZE, A_PTR

#define KERNEL_4x2 \
	MOVUPS (A_PTR), X4              \
	MOVUPS 2*SIZE(A_PTR), X5        \
	MOVUPS (A_PTR)(LDA*1), X6       \
	MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \
	MULPD  X8, X4                   \
	MULPD  X8, X5                   \
	MULPD  X9, X6                   \
	MULPD  X9, X7                   \
	ADDPD  X4, X0                   \
	ADDPD  X5, X1                   \
	ADDPD  X6, X0                   \
	ADDPD  X7, X1                   \
	ADDQ   $4*SIZE, A_PTR

#define KERNEL_4x1 \
	MOVUPS (A_PTR), X4       \
	MOVUPS 2*SIZE(A_PTR), X5 \
	MULPD  X8, X4            \
	MULPD  X8, X5            \
	ADDPD  X4, X0            \
	ADDPD  X5, X1            \
	ADDQ   $4*SIZE, A_PTR

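// STORE4 writes the four updated y elements back contiguously;
// STORE4_INC scatters them to a strided y.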
#define STORE4 \
	MOVUPS X0, (Y_PTR)       \
	MOVUPS X1, 2*SIZE(Y_PTR)

#define STORE4_INC \
	MOVLPD X0, (Y_PTR)          \
	MOVHPD X0, (Y_PTR)(INC_Y*1) \
	MOVLPD X1, (Y_PTR)(INC_Y*2) \
	MOVHPD X1, (Y_PTR)(INC3_Y*1)

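// KERNEL_2x{4,2,1} update a pair of y elements in X0 against four,
// two, or one rows of A.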
#define KERNEL_2x4 \
	MOVUPS (A_PTR), X4         \
	MOVUPS (A_PTR)(LDA*1), X5  \
	MOVUPS (A_PTR)(LDA*2), X6  \
	MOVUPS (A_PTR)(LDA3*1), X7 \
	MULPD  X8, X4              \
	MULPD  X9, X5              \
	MULPD  X10, X6             \
	MULPD  X11, X7             \
	ADDPD  X4, X0              \
	ADDPD  X5, X0              \
	ADDPD  X6, X0              \
	ADDPD  X7, X0              \
	ADDQ   $2*SIZE, A_PTR

#define KERNEL_2x2 \
	MOVUPS (A_PTR), X4        \
	MOVUPS (A_PTR)(LDA*1), X5 \
	MULPD  X8, X4             \
	MULPD  X9, X5             \
	ADDPD  X4, X0             \
	ADDPD  X5, X0             \
	ADDQ   $2*SIZE, A_PTR

#define KERNEL_2x1 \
	MOVUPS (A_PTR), X4    \
	MULPD  X8, X4         \
	ADDPD  X4, X0         \
	ADDQ   $2*SIZE, A_PTR

#define STORE2 \
	MOVUPS X0, (Y_PTR)

#define STORE2_INC \
	MOVLPD X0, (Y_PTR) \
	MOVHPD X0, (Y_PTR)(INC_Y*1)

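// The 1-element kernels read, update, and write back a single y
// element in place, so they need no separate load/store macros.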
#define KERNEL_1x4 \
	MOVSD (Y_PTR), X0         \
	MOVSD (A_PTR), X4         \
	MOVSD (A_PTR)(LDA*1), X5  \
	MOVSD (A_PTR)(LDA*2), X6  \
	MOVSD (A_PTR)(LDA3*1), X7 \
	MULSD X8, X4              \
	MULSD X9, X5              \
	MULSD X10, X6             \
	MULSD X11, X7             \
	ADDSD X4, X0              \
	ADDSD X5, X0              \
	ADDSD X6, X0              \
	ADDSD X7, X0              \
	MOVSD X0, (Y_PTR)         \
	ADDQ  $SIZE, A_PTR

#define KERNEL_1x2 \
	MOVSD (Y_PTR), X0        \
	MOVSD (A_PTR), X4        \
	MOVSD (A_PTR)(LDA*1), X5 \
	MULSD X8, X4             \
	MULSD X9, X5             \
	ADDSD X4, X0             \
	ADDSD X5, X0             \
	MOVSD X0, (Y_PTR)        \
	ADDQ  $SIZE, A_PTR

#define KERNEL_1x1 \
	MOVSD (Y_PTR), X0 \
	MOVSD (A_PTR), X4 \
	MULSD X8, X4      \
	ADDSD X4, X0      \
	MOVSD X0, (Y_PTR) \
	ADDQ  $SIZE, A_PTR

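// SCALE_{8,4,2,1} multiply 8, 4, 2, or 1 contiguous elements of y by
// beta in place; SCALEINC_{4,2} do the same for a strided y.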
#define SCALE_8(PTR, SCAL) \
	MOVUPS (PTR), X0   \
	MOVUPS 16(PTR), X1 \
	MOVUPS 32(PTR), X2 \
	MOVUPS 48(PTR), X3 \
	MULPD  SCAL, X0    \
	MULPD  SCAL, X1    \
	MULPD  SCAL, X2    \
	MULPD  SCAL, X3    \
	MOVUPS X0, (PTR)   \
	MOVUPS X1, 16(PTR) \
	MOVUPS X2, 32(PTR) \
	MOVUPS X3, 48(PTR)

#define SCALE_4(PTR, SCAL) \
	MOVUPS (PTR), X0   \
	MOVUPS 16(PTR), X1 \
	MULPD  SCAL, X0    \
	MULPD  SCAL, X1    \
	MOVUPS X0, (PTR)   \
	MOVUPS X1, 16(PTR)

#define SCALE_2(PTR, SCAL) \
	MOVUPS (PTR), X0 \
	MULPD  SCAL, X0  \
	MOVUPS X0, (PTR)

#define SCALE_1(PTR, SCAL) \
	MOVSD (PTR), X0 \
	MULSD SCAL, X0  \
	MOVSD X0, (PTR)

#define SCALEINC_4(PTR, INC, INC3, SCAL) \
	MOVSD (PTR), X0         \
	MOVSD (PTR)(INC*1), X1  \
	MOVSD (PTR)(INC*2), X2  \
	MOVSD (PTR)(INC3*1), X3 \
	MULSD SCAL, X0          \
	MULSD SCAL, X1          \
	MULSD SCAL, X2          \
	MULSD SCAL, X3          \
	MOVSD X0, (PTR)         \
	MOVSD X1, (PTR)(INC*1)  \
	MOVSD X2, (PTR)(INC*2)  \
	MOVSD X3, (PTR)(INC3*1)

#define SCALEINC_2(PTR, INC, SCAL) \
	MOVSD (PTR), X0        \
	MOVSD (PTR)(INC*1), X1 \
	MULSD SCAL, X0         \
	MULSD SCAL, X1         \
	MOVSD X0, (PTR)        \
	MOVSD X1, (PTR)(INC*1)

// func GemvT(m, n int,
//	alpha float64,
//	a []float64, lda int,
//	x []float64, incX int,
//	beta float64,
//	y []float64, incY int)
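//
// The routine computes y = alpha * Aᵀ * x + beta * y. A scalar Go
// sketch of the same computation (illustrative only, assuming unit
// increments; not part of the build):
//
//	for i := 0; i < n; i++ {
//		y[i] *= beta
//	}
//	for j := 0; j < m; j++ {
//		ax := alpha * x[j]
//		for i := 0; i < n; i++ {
//			y[i] += ax * a[j*lda+i]
//		}
//	}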
TEXT ·GemvT(SB), NOSPLIT, $32-128
	MOVQ M_DIM, M
	MOVQ N_DIM, N
	CMPQ M, $0
	JE   end
	CMPQ N, $0
	JE   end

	MOVDDUP alpha+16(FP), ALPHA

	MOVQ x_base+56(FP), X_PTR
	MOVQ y_base+96(FP), Y_PTR
	MOVQ a_base+24(FP), A_ROW
	MOVQ incY+120(FP), INC_Y // INC_Y = incY (scaled to bytes on the inc path)
	MOVQ lda+48(FP), LDA
	SHLQ $3, LDA             // LDA = lda * sizeof(float64)
	LEAQ (LDA)(LDA*2), LDA3  // LDA3 = LDA * 3
	MOVQ A_ROW, A_PTR

	MOVQ incX+80(FP), INC_X // INC_X = incX (scaled to bytes below)

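	// If incX is negative, x is traversed from its end, so advance
	// X_PTR by (N-1)*|incX| elements to point at the first element
	// used.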
	XORQ    TMP2, TMP2
	MOVQ    N, TMP1
	SUBQ    $1, TMP1
	NEGQ    TMP1
	IMULQ   INC_X, TMP1
	CMPQ    INC_X, $0
	CMOVQLT TMP1, TMP2
	LEAQ    (X_PTR)(TMP2*SIZE), X_PTR
	MOVQ    X_PTR, X

	SHLQ $3, INC_X
	LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3

	CMPQ incY+120(FP), $1 // Check for dense vector Y (fast-path)
	JNE  inc

	MOVSD  $1.0, X0
	COMISD beta+88(FP), X0
	JE     gemv_start // beta == 1: y is used as-is

	MOVSD  $0.0, X0
	COMISD beta+88(FP), X0
	JE     gemv_clear // beta == 0: y is cleared rather than scaled

	MOVDDUP beta+88(FP), BETA
	SHRQ    $3, M
	JZ      scal4

scal8:
	SCALE_8(Y_PTR, BETA)
	ADDQ $8*SIZE, Y_PTR
	DECQ M
	JNZ  scal8

scal4:
	TESTQ $4, M_DIM
	JZ    scal2
	SCALE_4(Y_PTR, BETA)
	ADDQ  $4*SIZE, Y_PTR

scal2:
	TESTQ $2, M_DIM
	JZ    scal1
	SCALE_2(Y_PTR, BETA)
	ADDQ  $2*SIZE, Y_PTR

scal1:
	TESTQ $1, M_DIM
	JZ    prep_end
	SCALE_1(Y_PTR, BETA)

	JMP prep_end

gemv_clear: // beta == 0 is special-cased to clear memory (no NaN handling)
	XORPS X0, X0
	XORPS X1, X1
	XORPS X2, X2
	XORPS X3, X3

	SHRQ $3, M
	JZ   clear4

clear8:
	MOVUPS X0, (Y_PTR)
	MOVUPS X1, 16(Y_PTR)
	MOVUPS X2, 32(Y_PTR)
	MOVUPS X3, 48(Y_PTR)
	ADDQ   $8*SIZE, Y_PTR
	DECQ   M
	JNZ    clear8

clear4:
	TESTQ  $4, M_DIM
	JZ     clear2
	MOVUPS X0, (Y_PTR)
	MOVUPS X1, 16(Y_PTR)
	ADDQ   $4*SIZE, Y_PTR

clear2:
	TESTQ  $2, M_DIM
	JZ     clear1
	MOVUPS X0, (Y_PTR)
	ADDQ   $2*SIZE, Y_PTR

clear1:
	TESTQ $1, M_DIM
	JZ    prep_end
	MOVSD X0, (Y_PTR)

prep_end:
	MOVQ Y, Y_PTR
	MOVQ M_DIM, M

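// Main loop: each outer pass consumes four elements of x (four rows
// of A), with 2- and 1-row tails; the inner loop updates y in blocks
// of 4, 2, and 1 elements.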
gemv_start:
	SHRQ $2, N
	JZ   c2

c4:
	// LOAD 4
	INIT4

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   c4r2

c4r4:
	// 4x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x4
	STORE4

	ADDQ $4*SIZE, Y_PTR

	DECQ M
	JNZ  c4r4

c4r2:
	TESTQ $2, M_DIM
	JZ    c4r1

	// 4x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x4
	STORE2

	ADDQ $2*SIZE, Y_PTR

c4r1:
	TESTQ $1, M_DIM
	JZ    c4end

	// 4x1 KERNEL
	KERNEL_1x4

	ADDQ $SIZE, Y_PTR

c4end:
	LEAQ (X_PTR)(INC_X*4), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ N
	JNZ  c4

c2:
	TESTQ $2, N_DIM
	JZ    c1

	// LOAD 2
	INIT2

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   c2r2

c2r4:
	// 2x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x2
	STORE4

	ADDQ $4*SIZE, Y_PTR

	DECQ M
	JNZ  c2r4

c2r2:
	TESTQ $2, M_DIM
	JZ    c2r1

	// 2x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x2
	STORE2

	ADDQ $2*SIZE, Y_PTR

c2r1:
	TESTQ $1, M_DIM
	JZ    c2end

	// 2x1 KERNEL
	KERNEL_1x2

	ADDQ $SIZE, Y_PTR

c2end:
	LEAQ (X_PTR)(INC_X*2), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

c1:
	TESTQ $1, N_DIM
	JZ    end

	// LOAD 1
	INIT1

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   c1r2

c1r4:
	// 1x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x1
	STORE4

	ADDQ $4*SIZE, Y_PTR

	DECQ M
	JNZ  c1r4

c1r2:
	TESTQ $2, M_DIM
	JZ    c1r1

	// 1x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x1
	STORE2

	ADDQ $2*SIZE, Y_PTR

c1r1:
	TESTQ $1, M_DIM
	JZ    end

	// 1x1 KERNEL
	KERNEL_1x1

end:
	RET

inc: // Algorithm for incY != 1 (split loads in kernel)
	XORQ    TMP2, TMP2
	MOVQ    M, TMP1
	SUBQ    $1, TMP1
	IMULQ   INC_Y, TMP1
	NEGQ    TMP1
	CMPQ    INC_Y, $0
	CMOVQLT TMP1, TMP2
	LEAQ    (Y_PTR)(TMP2*SIZE), Y_PTR
	MOVQ    Y_PTR, Y

	SHLQ $3, INC_Y
	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3

	MOVSD  $1.0, X0
	COMISD beta+88(FP), X0
	JE     inc_gemv_start

	MOVSD  $0.0, X0
	COMISD beta+88(FP), X0
	JE     inc_gemv_clear

	MOVDDUP beta+88(FP), BETA
	SHRQ    $2, M
	JZ      inc_scal2

inc_scal4:
	SCALEINC_4(Y_PTR, INC_Y, INC3_Y, BETA)
	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ M
	JNZ  inc_scal4

inc_scal2:
	TESTQ $2, M_DIM
	JZ    inc_scal1

	SCALEINC_2(Y_PTR, INC_Y, BETA)
	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_scal1:
	TESTQ $1, M_DIM
	JZ    inc_prep_end
	SCALE_1(Y_PTR, BETA)

	JMP inc_prep_end

inc_gemv_clear: // beta == 0 is special-cased to clear memory (no NaN handling)
	XORPS X0, X0
	XORPS X1, X1
	XORPS X2, X2
	XORPS X3, X3

	SHRQ $2, M
	JZ   inc_clear2

inc_clear4:
	MOVSD X0, (Y_PTR)
	MOVSD X1, (Y_PTR)(INC_Y*1)
	MOVSD X2, (Y_PTR)(INC_Y*2)
	MOVSD X3, (Y_PTR)(INC3_Y*1)
	LEAQ  (Y_PTR)(INC_Y*4), Y_PTR
	DECQ  M
	JNZ   inc_clear4

inc_clear2:
	TESTQ $2, M_DIM
	JZ    inc_clear1
	MOVSD X0, (Y_PTR)
	MOVSD X1, (Y_PTR)(INC_Y*1)
	LEAQ  (Y_PTR)(INC_Y*2), Y_PTR

inc_clear1:
	TESTQ $1, M_DIM
	JZ    inc_prep_end
	MOVSD X0, (Y_PTR)

inc_prep_end:
	MOVQ Y, Y_PTR
	MOVQ M_DIM, M

inc_gemv_start:
	SHRQ $2, N
	JZ   inc_c2

inc_c4:
	// LOAD 4
	INIT4

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   inc_c4r2

inc_c4r4:
	// 4x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x4
	STORE4_INC

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

	DECQ M
	JNZ  inc_c4r4

inc_c4r2:
	TESTQ $2, M_DIM
	JZ    inc_c4r1

	// 4x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x4
	STORE2_INC

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_c4r1:
	TESTQ $1, M_DIM
	JZ    inc_c4end

	// 4x1 KERNEL
	KERNEL_1x4

	ADDQ INC_Y, Y_PTR

inc_c4end:
	LEAQ (X_PTR)(INC_X*4), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ N
	JNZ  inc_c4

inc_c2:
	TESTQ $2, N_DIM
	JZ    inc_c1

	// LOAD 2
	INIT2

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   inc_c2r2

inc_c2r4:
	// 2x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x2
	STORE4_INC

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ M
	JNZ  inc_c2r4

inc_c2r2:
	TESTQ $2, M_DIM
	JZ    inc_c2r1

	// 2x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x2
	STORE2_INC

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_c2r1:
	TESTQ $1, M_DIM
	JZ    inc_c2end

	// 2x1 KERNEL
	KERNEL_1x2

	ADDQ INC_Y, Y_PTR

inc_c2end:
	LEAQ (X_PTR)(INC_X*2), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

inc_c1:
	TESTQ $1, N_DIM
	JZ    inc_end

	// LOAD 1
	INIT1

	MOVQ M_DIM, M
	SHRQ $2, M
	JZ   inc_c1r2

inc_c1r4:
	// 1x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x1
	STORE4_INC

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ M
	JNZ  inc_c1r4

inc_c1r2:
	TESTQ $2, M_DIM
	JZ    inc_c1r1

	// 1x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x1
	STORE2_INC

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_c1r1:
	TESTQ $1, M_DIM
	JZ    inc_end

	// 1x1 KERNEL
	KERNEL_1x1

inc_end:
	RET