;******************************************************************************
;* Copyright (c) Lynne
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Open `doc/transforms.md` to see the code the transforms here are based on,
; and to compare against it.

; TODO:
;       carry over registers from smaller transforms to save on ~8 loads/stores
;       check if vinsertf128 could be faster than vperm2f128 for duplication
;       even faster FFT8 (current one is very #instructions optimized)
;       replace some xors with blends + addsubs?
;       replace some shuffles with vblends?
;       avx512 split-radix

%include "libavutil/x86/x86util.asm"

%define private_prefix ff_tx

%if ARCH_X86_64
%define ptr resq
%else
%define ptr resd
%endif

%assign i 16
%rep 14
cextern tab_ %+ i %+ _float ; ff_tab_i_float...
%assign i (i << 1)
%endrep
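
; Note: the %rep above declares the 14 power-of-two float twiddle tables,
; sizes 16 through 131072; with private_prefix applied the symbols resolve
; to ff_tx_tab_<N>_float, initialized on the C side of libavutil/tx.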

struc AVTXContext
    .len:          resd 1 ; Length
    .inv:          resd 1 ; Inverse flag
    .map:           ptr 1 ; Lookup table(s)
    .exp:           ptr 1 ; Exponentiation factors
    .tmp:           ptr 1 ; Temporary data

    .sub:           ptr 1 ; Subcontexts
    .fn:            ptr 4 ; Subcontext functions
    .nb_sub:       resd 1 ; Subcontext count

    ; Everything else is inaccessible
endstruc
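
; Note: this struc mirrors the leading fields of the C-side AVTXContext
; (libavutil/tx_priv.h); only the members accessed from assembly are
; declared here, and their offsets must stay in sync with the C struct.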

SECTION_RODATA 32

%define POS 0x00000000
%define NEG 0x80000000

%define M_SQRT1_2 0.707106781186547524401
%define COS16_1   0.92387950420379638671875
%define COS16_3   0.3826834261417388916015625

d8_mult_odd:   dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \
                  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2

s8_mult_odd:   dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
s8_perm_even:  dd 1, 3, 0, 2, 1, 3, 2, 0
s8_perm_odd1:  dd 3, 3, 1, 1, 1, 1, 3, 3
s8_perm_odd2:  dd 1, 2, 0, 3, 1, 0, 0, 1

s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
s16_mult_odd1: dd COS16_1,  COS16_1,  COS16_3,  COS16_3,  COS16_1, -COS16_1,  COS16_3, -COS16_3
s16_mult_odd2: dd COS16_3, -COS16_3,  COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm:      dd 0, 1, 2, 3, 1, 0, 3, 2

mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
mask_pmpmpmpm: times 4 dd POS, NEG
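
; Note: M_SQRT1_2 = cos(pi/4), COS16_1 = cos(pi/8) and COS16_3 = cos(3*pi/8)
; are the only twiddle components the fixed-size kernels below need; the
; POS/NEG masks encode per-lane sign flips applied with xorps, standing in
; for the sub/add variants SSE/AVX lack.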

SECTION .text

; Load complex values (64 bits) via a lookup table
; %1 - output register
; %2 - GPR of base input memory address
; %3 - GPR of LUT (int32_t indices) address
; %4 - LUT offset
; %5 - temporary GPR (only used if vgather is not used)
; %6 - temporary register (for avx only)
; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
%macro LOAD64_LUT 5-7
%if %0 > 6 && cpuflag(avx2)
    pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
    movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
%else
    mov      %5d, [%3 + %4 + 0]
    movsd  xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov      %5d, [%3 + %4 + 8]
    movsd  xmm%6, [%2 + %5q*8]
%endif
    mov      %5d, [%3 + %4 + 4]
    movhps xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov      %5d, [%3 + %4 + 12]
    movhps xmm%6, [%2 + %5q*8]
    vinsertf128 %1, %1, xmm%6, 1
%endif
%endif
%endmacro
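
; Illustrative scalar equivalent of the gather above (assumed C, for
; reference only; AVComplexFloat is the (re, im) pair from libavutil/tx.h):
;     for (int i = 0; i < n; i++)
;         dst[i] = src[lut[i]]; // one 64-bit complex load per int32 index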

; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
; %1 - coefficients (r0.reim, r1.reim)
; %2 - temporary
%macro FFT2 2
    shufps   %2, %1, %1, q3322
    shufps   %1, %1, %1, q1100

    addsubps %1, %1, %2

    shufps   %1, %1, %1, q2031
%endmacro
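
; Reference math for FFT2 (a 2-point DFT needs no twiddles and is identical
; in both directions):
;     out0 = r0 + r1
;     out1 = r0 - r1
; where each +/- acts on a packed (re, im) pair.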

; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
%macro FFT4 3
    subps  %3, %1, %2         ;  r1234, [r5678]
    addps  %1, %1, %2         ;  t1234, [t5678]

    shufps %2, %1, %3, q1010  ;  t12, r12
    shufps %1, %1, %3, q2332  ;  t34, r43

    subps  %3, %2, %1         ;  a34, b32
    addps  %2, %2, %1         ;  a12, b14

    shufps %1, %2, %3, q1010  ;  a1234     even

    shufps %2, %2, %3, q2332  ;  b1423
    shufps %2, %2, %2, q1320  ;  b1234     odd
%endmacro
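
; Reference math for a 4-point DFT (forward direction shown; the inverse
; entry points below flip direction by permuting inputs instead of
; conjugating twiddles):
;     t0 = r0 + r2          t1 = r0 - r2
;     t2 = r1 + r3          t3 = r1 - r3
;     out0 = t0 + t2        out2 = t0 - t2
;     out1 = t1 - i*t3      out3 = t1 + i*t3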

; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
; %3 - odd coefficients  (a1.reim, a3.reim, [b1.reim, b3.reim])
; %4 - odd coefficients  (a5.reim, a7.reim, [b5.reim, b7.reim])
; %5 - temporary
; %6 - temporary
%macro FFT8 6
    addps    %5, %1, %3               ; q1-8
    addps    %6, %2, %4               ; k1-8

    subps    %1, %1, %3               ; r1-8
    subps    %2, %2, %4               ; j1-8

    shufps   %4, %1, %1, q2323        ; r4343
    shufps   %3, %5, %6, q3032        ; q34, k14

    shufps   %1, %1, %1, q1010        ; r1212
    shufps   %5, %5, %6, q1210        ; q12, k32

    xorps    %4, %4, [mask_pmmppmmp]  ; r4343 * pmmp
    addps    %6, %5, %3               ; s12, g12

    mulps    %2, %2, [d8_mult_odd]    ; r8 * d8_mult_odd
    subps    %5, %5, %3               ; s34, g43

    addps    %3, %1, %4               ; z1234
    unpcklpd %1, %6, %5               ; s1234

    shufps   %4, %2, %2, q2301        ; j2143
    shufps   %6, %6, %5, q2332        ; g1234

    addsubps %2, %2, %4               ; l2143
    shufps   %5, %2, %2, q0123        ; l3412
    addsubps %5, %5, %2               ; t1234

    subps    %2, %1, %6               ; h1234 even
    subps    %4, %3, %5               ; u1234 odd

    addps    %1, %1, %6               ; w1234 even
    addps    %3, %3, %5               ; o1234 odd
%endmacro
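
; Reference: an 8-point transform is a radix-2 combination of two 4-point
; halves; the only 8th roots of unity needing real multiplies are w^1 and
; w^3, whose components are all +/-M_SQRT1_2, which is what d8_mult_odd
; (and s8_mult_odd in the AVX version) encode.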

; Single 8-point in-place complex FFT in 20 instructions
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
; %4 - temporary
%macro FFT8_AVX 4
    subps      %3, %1, %2               ;  r1234, r5678
    addps      %1, %1, %2               ;  q1234, q5678

    vpermilps  %2, %3, [s8_perm_odd1]   ;  r4422, r6688
    shufps     %4, %1, %1, q3322        ;  q1122, q5566

    movsldup   %3, %3                   ;  r1133, r5577
    shufps     %1, %1, %1, q1100        ;  q3344, q7788

    addsubps   %3, %3, %2               ;  z1234, z5678
    addsubps   %1, %1, %4               ;  s3142, s7586

    mulps      %3, %3, [s8_mult_odd]    ;  z * s8_mult_odd
    vpermilps  %1, %1, [s8_perm_even]   ;  s1234, s5687 !

    shufps     %2, %3, %3, q2332        ;   junk, z7887
    xorps      %4, %1, [mask_mmmmpppm]  ;  e1234, e5687 !

    vpermilps  %3, %3, [s8_perm_odd2]   ;  z2314, z6556
    vperm2f128 %1, %1, %4, 0x03         ;  e5687, s1234

    addsubps   %2, %2, %3               ;   junk, t5678
    subps      %1, %1, %4               ;  w1234, w5678 even

    vperm2f128 %2, %2, %2, 0x11         ;  t5678, t5678
    vperm2f128 %3, %3, %3, 0x00         ;  z2314, z2314

    xorps      %2, %2, [mask_ppmpmmpm]  ;  t * ppmpmmpm
    addps      %2, %3, %2               ;  u1234, u5678 odd
%endmacro

; Single 16-point in-place complex FFT
; %1 - even coefficients (r0.reim, r2.reim,  r4.reim,  r6.reim)
; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
; %3 - odd coefficients  (r1.reim, r3.reim,  r5.reim,  r7.reim)
; %4 - odd coefficients  (r9.reim, r11.reim, r13.reim, r15.reim)
; %5, %6 - temporary
; %7, %8 - temporary (optional)
%macro FFT16 6-8
    FFT4       %3, %4, %5
%if %0 > 7
    FFT8_AVX   %1, %2, %6, %7
    movaps     %8, [mask_mpmppmpm]
    movaps     %7, [s16_perm]
%define mask %8
%define perm %7
%elif %0 > 6
    FFT8_AVX   %1, %2, %6, %7
    movaps     %7, [s16_perm]
%define mask [mask_mpmppmpm]
%define perm %7
%else
    FFT8_AVX   %1, %2, %6, %5
%define mask [mask_mpmppmpm]
%define perm [s16_perm]
%endif
    xorps      %5, %5, %5                   ; 0

    shufps     %6, %4, %4, q2301            ; z12.imre, z13.imre...
    shufps     %5, %5, %3, q2301            ; 0, 0, z8.imre...

    mulps      %4, %4, [s16_mult_odd1]      ; z.reim * costab
    xorps      %5, %5, [mask_mppmmpmp]
%if cpuflag(fma3)
    fmaddps    %6, %6, [s16_mult_odd2], %4  ; s[8..15]
    addps      %5, %3, %5                   ; s[0...7]
%else
    mulps      %6, %6, [s16_mult_odd2]      ; z.imre * costab

    addps      %5, %3, %5                   ; s[0...7]
    addps      %6, %4, %6                   ; s[8..15]
%endif
    mulps      %5, %5, [s16_mult_even]      ; s[0...7]*costab

    xorps      %4, %6, mask                 ; s[8..15]*mpmppmpm
    xorps      %3, %5, mask                 ; s[0...7]*mpmppmpm

    vperm2f128 %4, %4, %4, 0x01             ; s[12..15, 8..11]
    vperm2f128 %3, %3, %3, 0x01             ; s[4..7, 0..3]

    addps      %6, %6, %4                   ; y56, u56, y34, u34
    addps      %5, %5, %3                   ; w56, x56, w34, x34

    vpermilps  %6, %6, perm                 ; y56, u56, y43, u43
    vpermilps  %5, %5, perm                 ; w56, x56, w43, x43

    subps      %4, %2, %6                   ; odd  part 2
    addps      %3, %2, %6                   ; odd  part 1

    subps      %2, %1, %5                   ; even part 2
    addps      %1, %1, %5                   ; even part 1
%undef mask
%undef perm
%endmacro
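
; Reference: the 16-point odd-index twiddles w^k = exp(-2*pi*i*k/16),
; k = 1, 3, 5, 7, only need cos(pi/8) and cos(3*pi/8) in various sign
; combinations; those are what the s16_mult_odd1/s16_mult_odd2 constants
; pack, while s16_mult_even covers the 1 and M_SQRT1_2 factors.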

; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
; Uses all 16 registers.
; Output is slightly permuted such that tx2,3's coefficients are interleaved
; on a 2-point basis (look at `doc/transforms.md`)
%macro SPLIT_RADIX_COMBINE 17
%if %1 && mmsize == 32
    vperm2f128 %14, %6, %7, 0x20     ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 %16, %9, %8, 0x20     ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 %15, %6, %7, 0x31     ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 %17, %9, %8, 0x31     ; m2[2], m2[3], m3[2], m3[3] odd
%endif

    shufps     %12, %10, %10, q2200  ; cos00224466
    shufps     %13, %11, %11, q1133  ; wim77553311
    movshdup   %10, %10              ; cos11335577
    shufps     %11, %11, %11, q0022  ; wim66442200

%if %1 && mmsize == 32
    shufps     %6, %14, %14, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps     %8, %16, %16, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps     %7, %15, %15, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps     %9, %17, %17, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd

    mulps      %14, %14, %13         ; m2[0123]reim * wim7531 even
    mulps      %16, %16, %11         ; m2[0123]reim * wim7531 odd
    mulps      %15, %15, %13         ; m3[0123]reim * wim7531 even
    mulps      %17, %17, %11         ; m3[0123]reim * wim7531 odd
%else
    mulps      %14, %6, %13          ; m2,3[01]reim * wim7531 even
    mulps      %16, %8, %11          ; m2,3[01]reim * wim7531 odd
    mulps      %15, %7, %13          ; m2,3[23]reim * wim7531 even
    mulps      %17, %9, %11          ; m2,3[23]reim * wim7531 odd
    ; reorder the multiplies to save movs reg, reg in the %if above
    shufps     %6, %6, %6, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps     %8, %8, %8, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps     %7, %7, %7, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps     %9, %9, %9, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif

%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
    fmaddsubps %6, %6, %12, %14      ; w[0..8] even
    fmaddsubps %8, %8, %10, %16      ; w[0..8] odd
    fmsubaddps %7, %7, %12, %15      ; j[0..8] even
    fmsubaddps %9, %9, %10, %17      ; j[0..8] odd
    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
%else
    mulps      %6, %6, %12           ; m2,3[01]imre * cos0246
    mulps      %8, %8, %10           ; m2,3[01]imre * cos0246
    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
    mulps      %7, %7, %12           ; m2,3[23]reim * cos0246
    mulps      %9, %9, %10           ; m2,3[23]reim * cos0246
    addsubps   %6, %6, %14           ; w[0..8]
    addsubps   %8, %8, %16           ; w[0..8]
    xorps      %15, %15, %13         ; +-m2,3[23]imre * wim7531
    xorps      %17, %17, %13         ; +-m2,3[23]imre * wim7531
    addps      %7, %7, %15           ; j[0..8]
    addps      %9, %9, %17           ; j[0..8]
%endif

    addps      %14, %6, %7           ; t10235476 even
    addps      %16, %8, %9           ; t10235476 odd
    subps      %15, %6, %7           ; +-r[0..7] even
    subps      %17, %8, %9           ; +-r[0..7] odd

    shufps     %14, %14, %14, q2301  ; t[0..7] even
    shufps     %16, %16, %16, q2301  ; t[0..7] odd
    xorps      %15, %15, %13         ; r[0..7] even
    xorps      %17, %17, %13         ; r[0..7] odd

    subps      %6, %2, %14           ; m2,3[01] even
    subps      %8, %4, %16           ; m2,3[01] odd
    subps      %7, %3, %15           ; m2,3[23] even
    subps      %9, %5, %17           ; m2,3[23] odd

    addps      %2, %2, %14           ; m0 even
    addps      %4, %4, %16           ; m0 odd
    addps      %3, %3, %15           ; m1 even
    addps      %5, %5, %17           ; m1 odd
%endmacro
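
; Reference recombination (textbook split-radix, cf. doc/transforms.md):
; with E[] the length-N/2 even sub-transform, O1[]/O2[] the two length-N/4
; odd sub-transforms and w = exp(-2*pi*i/N), for k in [0, N/4):
;     a = w^k * O1[k] + w^(3k) * O2[k]
;     b = w^k * O1[k] - w^(3k) * O2[k]
;     X[k]       = E[k]       + a        X[k + N/2]  = E[k]       - a
;     X[k + N/4] = E[k + N/4] - i*b      X[k + 3N/4] = E[k + N/4] + i*b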

; Same as above, but only does one parity at a time and takes 3 temporary
; registers. However, if the twiddles aren't needed after this, the registers
; they occupy can be used as any of the temporary registers.
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps     %8, %6, %6, q2200     ; cos00224466
    shufps     %9, %7, %7, q1133     ; wim77553311
%else
    shufps     %8, %6, %6, q3311     ; cos11335577
    shufps     %9, %7, %7, q0022     ; wim66442200
%endif

    mulps      %10, %4, %9           ; m2,3[01]reim * wim7531 even
    mulps      %9, %9, %5            ; m2,3[23]reim * wim7531 even

    shufps     %4, %4, %4, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps     %5, %5, %5, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10       ; w[0..8] even
    fmsubaddps %5, %5, %8, %9        ; j[0..8] even
    movaps     %10, [mask_pmpmpmpm]
%else
    mulps      %4, %4, %8            ; m2,3[01]imre * cos0246
    mulps      %5, %5, %8            ; m2,3[23]reim * cos0246
    addsubps   %4, %4, %10           ; w[0..8]
    movaps     %10, [mask_pmpmpmpm]
    xorps      %9, %9, %10           ; +-m2,3[23]imre * wim7531
    addps      %5, %5, %9            ; j[0..8]
%endif

    addps      %8, %4, %5            ; t10235476
    subps      %9, %4, %5            ; +-r[0..7]

    shufps     %8, %8, %8, q2301     ; t[0..7]
    xorps      %9, %9, %10           ; r[0..7]

    subps      %4, %2, %8            ; m2,3[01]
    subps      %5, %3, %9            ; m2,3[23]

    addps      %2, %2, %8            ; m0
    addps      %3, %3, %9            ; m1
%endmacro

; Same as above, tries REALLY hard to use 2 temporary registers.
%macro SPLIT_RADIX_COMBINE_LITE 9
%if %1
    shufps     %8, %6, %6, q2200        ; cos00224466
    shufps     %9, %7, %7, q1133        ; wim77553311
%else
    shufps     %8, %6, %6, q3311        ; cos11335577
    shufps     %9, %7, %7, q0022        ; wim66442200
%endif

    mulps      %9, %9, %4               ; m2,3[01]reim * wim7531 even
    shufps     %4, %4, %4, q2301        ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %9           ; w[0..8] even
%else
    mulps      %4, %4, %8               ; m2,3[01]imre * cos0246
    addsubps   %4, %4, %9               ; w[0..8]
%endif

%if %1
    shufps     %9, %7, %7, q1133        ; wim77553311
%else
    shufps     %9, %7, %7, q0022        ; wim66442200
%endif

    mulps      %9, %9, %5               ; m2,3[23]reim * wim7531 even
    shufps     %5, %5, %5, q2301        ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
%if cpuflag(fma3)
    fmsubaddps %5, %5, %8, %9           ; j[0..8] even
%else
    mulps      %5, %5, %8               ; m2,3[23]reim * cos0246
    xorps      %9, %9, [mask_pmpmpmpm]  ; +-m2,3[23]imre * wim7531
    addps      %5, %5, %9               ; j[0..8]
%endif

    addps      %8, %4, %5               ; t10235476
    subps      %9, %4, %5               ; +-r[0..7]

    shufps     %8, %8, %8, q2301        ; t[0..7]
    xorps      %9, %9, [mask_pmpmpmpm]  ; r[0..7]

    subps      %4, %2, %8               ; m2,3[01]
    subps      %5, %3, %9               ; m2,3[23]

    addps      %2, %2, %8               ; m0
    addps      %3, %3, %9               ; m1
%endmacro

%macro SPLIT_RADIX_COMBINE_64 0
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2

    movaps [outq +  0*mmsize], m0
    movaps [outq +  4*mmsize], m1
    movaps [outq +  8*mmsize], tx1_e0
    movaps [outq + 12*mmsize], tx2_e0

    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0

    movaps [outq +  2*mmsize], m2
    movaps [outq +  6*mmsize], m3
    movaps [outq + 10*mmsize], tx1_o0
    movaps [outq + 14*mmsize], tx2_o0

    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    movaps [outq +  1*mmsize], m0
    movaps [outq +  3*mmsize], m1
    movaps [outq +  5*mmsize], m2
    movaps [outq +  7*mmsize], m3

    movaps [outq +  9*mmsize], tx1_e1
    movaps [outq + 11*mmsize], tx1_o1
    movaps [outq + 13*mmsize], tx2_e1
    movaps [outq + 15*mmsize], tx2_o1
%endmacro

; Perform a single even/odd split radix combination with loads and stores
; The _4 indicates this is a quarter of the iterations required to complete a full
; combine loop
; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
%macro SPLIT_RADIX_LOAD_COMBINE_4 8
    movaps m8,         [rtabq + (%5)*mmsize + %7]
    vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23

    movaps m0, [outq +      (0 + %4)*mmsize + %6]
    movaps m2, [outq +      (2 + %4)*mmsize + %6]
    movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
    movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]

    movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
    movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
    movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
    movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    movaps [outq +      (0 + %4)*mmsize + %6], m0
    movaps [outq +      (2 + %4)*mmsize + %6], m2
    movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
    movaps [outq + %1 + (2 + %4)*mmsize + %6], m3

    movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
    movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
    movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
    movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
%endmacro

%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
%if %0 > 2
%define offset_c %3
%else
%define offset_c 0
%endif
%if %0 > 3
%define offset_r %4
%else
%define offset_r 0
%endif
%if %0 > 4
%define offset_i %5
%else
%define offset_i 0
%endif

    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
%endmacro

; Perform a single even/odd split radix combination with loads, deinterleaves and
; stores. The _2 indicates this is half of the iterations required to complete
; a full combine+deinterleave loop
; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
    movaps m8,         [rtabq + (0 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 0 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 0 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]

    movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
       m4, m5, m6, m7, \
       m8, m9, \
       m10, m11, m12, m13, m14, m15

    unpckhpd m10, m0, m2
    unpckhpd m11, m1, m3
    unpckhpd m12, m4, m6
    unpckhpd m13, m5, m7
    unpcklpd m0, m0, m2
    unpcklpd m1, m1, m3
    unpcklpd m4, m4, m6
    unpcklpd m5, m5, m7

    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 +  0], m0,  0
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 +  0], m1,  0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0

    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 +  0], m4,  0
    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 +  0], m5,  0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0

    vperm2f128 m10, m10, m0, 0x13
    vperm2f128 m11, m11, m1, 0x13
    vperm2f128 m12, m12, m4, 0x13
    vperm2f128 m13, m13, m5, 0x13

    movaps m8,         [rtabq + (1 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 1 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 1 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq +      (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
    movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict

    movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
    movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m8,  m0, m2
    unpcklpd m9,  m1, m3
    unpcklpd m10, m4, m6
    unpcklpd m11, m5, m7
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m4, m4, m6
    unpckhpd m5, m5, m7

    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 +  0], m8,  0
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 + 16], m0,  0
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 +  0], m8,  1
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 + 16], m0,  1

    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 +  0], m9,  0
    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1,  0
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 +  0], m9,  1
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1,  1

    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 +  0], m10, 0
    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4,  0
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 +  0], m10, 1
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4,  1

    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 +  0], m11, 0
    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5,  0
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 +  0], m11, 1
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5,  1
%endmacro

%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
%if %0 > 2
%define offset %3
%else
%define offset 0
%endif
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
%endmacro
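
; Note: every entry point below follows the av_tx_fn signature from
; libavutil/tx.h,
;     void fn(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
; hence the 4-argument cglobal declarations (ctx, out, in, stride), even
; where the 4th slot is only reused as a scratch register (tmp).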

INIT_XMM sse3
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
    movaps m0, [inq]
    FFT2 m0, m1
    movaps [outq], m0
    RET

%macro FFT4 2
INIT_XMM sse2
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]

%if %2
    shufps m2, m1, m0, q3210
    shufps m0, m0, m1, q3210
    movaps m1, m2
%endif

    FFT4 m0, m1, m2

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    movaps [outq + 0*mmsize], m2
    movaps [outq + 1*mmsize], m0

    RET
%endmacro

FFT4 fwd, 0
FFT4 inv, 1
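
; Note on the inverse variant above: no twiddles are conjugated. The
; %2-guarded shufps pair swaps in1 and in3, and the forward DFT of the
; index-reversed input x[(-n) mod N] equals the unnormalized inverse DFT
; of x, so the same kernel serves both directions.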

%macro FFT8_SSE_FN 2
INIT_XMM sse3
cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%endif

    FFT8 m0, m1, m2, m3, m4, m5

    unpcklpd m4, m0, m3
    unpcklpd m5, m1, m2
    unpckhpd m0, m0, m3
    unpckhpd m1, m1, m2

    movups [outq + 0*mmsize], m4
    movups [outq + 1*mmsize], m0
    movups [outq + 2*mmsize], m5
    movups [outq + 3*mmsize], m1

    RET
%endmacro

FFT8_SSE_FN float,    1
FFT8_SSE_FN ns_float, 0

%macro FFT8_AVX_FN 2
INIT_YMM avx
cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
%endif

    FFT8_AVX m0, m1, m2, m3

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    ; Around 2% faster than 2x vperm2f128 + 2x movapd
    vextractf128 [outq + 16*0], m2, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m2, 1
    vextractf128 [outq + 16*3], m0, 1

    RET
%endmacro

FFT8_AVX_FN float,    1
FFT8_AVX_FN ns_float, 0

%macro FFT16_FN 3
INIT_YMM %1
cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
%endif

    FFT16 m0, m1, m2, m3, m4, m5, m6, m7

    unpcklpd m5, m1, m3
    unpcklpd m4, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m0, m0, m2

    vextractf128 [outq + 16*0], m4, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m4, 1
    vextractf128 [outq + 16*3], m0, 1
    vextractf128 [outq + 16*4], m5, 0
    vextractf128 [outq + 16*5], m1, 0
    vextractf128 [outq + 16*6], m5, 1
    vextractf128 [outq + 16*7], m1, 1

    RET
%endmacro

FFT16_FN avx,  float,    0
FFT16_FN avx,  ns_float, 1
FFT16_FN fma3, float,    0
FFT16_FN fma3, ns_float, 1

%macro FFT32_FN 3
INIT_YMM %1
cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq,  m8,  m9
    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
    LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m12, m13
    LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m14, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq,  m8,  m9
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m10, m11
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
%endif

    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd  m9, m1, m3
    unpcklpd m10, m5, m7
    unpcklpd  m8, m0, m2
    unpcklpd m11, m4, m6
    unpckhpd  m1, m1, m3
    unpckhpd  m5, m5, m7
    unpckhpd  m0, m0, m2
    unpckhpd  m4, m4, m6

    vextractf128 [outq + 16* 0],  m8, 0
    vextractf128 [outq + 16* 1],  m0, 0
    vextractf128 [outq + 16* 2],  m8, 1
    vextractf128 [outq + 16* 3],  m0, 1
    vextractf128 [outq + 16* 4],  m9, 0
    vextractf128 [outq + 16* 5],  m1, 0
    vextractf128 [outq + 16* 6],  m9, 1
    vextractf128 [outq + 16* 7],  m1, 1

    vextractf128 [outq + 16* 8], m11, 0
    vextractf128 [outq + 16* 9],  m4, 0
    vextractf128 [outq + 16*10], m11, 1
    vextractf128 [outq + 16*11],  m4, 1
    vextractf128 [outq + 16*12], m10, 0
    vextractf128 [outq + 16*13],  m5, 0
    vextractf128 [outq + 16*14], m10, 1
    vextractf128 [outq + 16*15],  m5, 1

    RET
%endmacro

%if ARCH_X86_64
FFT32_FN avx,  float,    0
FFT32_FN avx,  ns_float, 1
FFT32_FN fma3, float,    0
FFT32_FN fma3, ns_float, 1
%endif

%macro FFT_SPLIT_RADIX_DEF 1-2
ALIGN 16
.%1 %+ pt:
    PUSH lenq
    mov lenq, (%1/4)

    add outq, (%1*4) - (%1/1)
    call .32pt

    add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
    call .32pt

    POP lenq
    sub outq, (%1*4) + (%1*2) + (%1/2)

    lea rtabq, [tab_ %+ %1 %+ _float]
    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]

%if %0 > 1
    cmp tgtq, %1
    je .deinterleave

    mov tmpq, %1

.synth_ %+ %1:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 4*mmsize
    jg .synth_ %+ %1

    cmp lenq, %1
    jg %2 ; can't do math here, nasm doesn't get it
    ret
%endif
%endmacro
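
; Illustrative overview (an assumed reading of the code below, not a spec):
; fft_sr_* computes the even half of each size by falling through the
; .32pt/.64pt/... blocks; each larger .Npt block then computes the two
; N/4-point odd sub-transforms by calling back into .32pt with lenq = N/4,
; runs the split-radix combine loop over the output, and either returns to
; its caller or falls through to the next power of two until the tgtq-sized
; pass finishes in .deinterleave.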

%macro FFT_SPLIT_RADIX_FN 3
INIT_YMM %1
cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
    movsxd lenq, dword [lutq + AVTXContext.len]
    mov lutq, [lutq + AVTXContext.map]
    mov tgtq, lenq

; Bottom-most/32-point transform ===============================================
ALIGN 16
.32pt:
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq,  m8,  m9
    LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m10, m11
    LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m12, m13
    LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m14, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq,  m8,  m9
    LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m10, m11
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
%endif

    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    movaps [outq + 1*mmsize], m1
    movaps [outq + 3*mmsize], m3
    movaps [outq + 5*mmsize], m5
    movaps [outq + 7*mmsize], m7

%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp lenq, 32
    jg .64pt

    movaps [outq + 0*mmsize], m0
    movaps [outq + 2*mmsize], m2
    movaps [outq + 4*mmsize], m4
    movaps [outq + 6*mmsize], m6

    ret

; 64-point transform ===========================================================
ALIGN 16
.64pt:
; Helper defines; these make it easier to track what's happening
%define tx1_e0 m4
%define tx1_e1 m5
%define tx1_o0 m6
%define tx1_o1 m7
%define tx2_e0 m8
%define tx2_e1 m9
%define tx2_o0 m10
%define tx2_o1 m11
%define tw_e m12
%define tw_o m13
%define tmp1 m14
%define tmp2 m15

    SWAP m4, m1
    SWAP m6, m3

%if %3
    movaps tx1_e0, [inq + 0*mmsize]
    movaps tx1_e1, [inq + 1*mmsize]
    movaps tx1_o0, [inq + 2*mmsize]
    movaps tx1_o1, [inq + 3*mmsize]
%else
    LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tw_o
    LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tmp1, tmp2
    LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tw_o
    LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp1, tmp2
%endif

    FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1

%if %3
    movaps tx2_e0, [inq + 4*mmsize]
    movaps tx2_e1, [inq + 5*mmsize]
    movaps tx2_o0, [inq + 6*mmsize]
    movaps tx2_o1, [inq + 7*mmsize]
%else
    LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1, tmp2
    LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_e, tw_o
    LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tmp1, tmp2
    LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e, tw_o
%endif

    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o

    movaps tw_e,           [tab_64_float]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23

%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp tgtq, 64
    je .deinterleave

    SPLIT_RADIX_COMBINE_64

    cmp lenq, 64
    jg .128pt
    ret

; 128-point transform ==========================================================
ALIGN 16
.128pt:
    PUSH lenq
    mov lenq, 32

    add outq, 16*mmsize
    call .32pt

    add outq, 8*mmsize
    call .32pt

    POP lenq
    sub outq, 24*mmsize

    lea rtabq, [tab_128_float]
    lea itabq, [tab_128_float + 128 - 4*7]

    cmp tgtq, 128
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128

    cmp lenq, 128
    jg .256pt
    ret

; 256-point transform ==========================================================
ALIGN 16
.256pt:
    PUSH lenq
    mov lenq, 64

    add outq, 32*mmsize
    call .32pt

    add outq, 16*mmsize
    call .32pt

    POP lenq
    sub outq, 48*mmsize

    lea rtabq, [tab_256_float]
    lea itabq, [tab_256_float + 256 - 4*7]

    cmp tgtq, 256
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize

    cmp lenq, 256
    jg .512pt
    ret

; 512-point transform ==========================================================
ALIGN 16
.512pt:
    PUSH lenq
    mov lenq, 128

    add outq, 64*mmsize
    call .32pt

    add outq, 32*mmsize
    call .32pt

    POP lenq
    sub outq, 96*mmsize

    lea rtabq, [tab_512_float]
    lea itabq, [tab_512_float + 512 - 4*7]

    cmp tgtq, 512
    je .deinterleave

    mov tmpq, 4

.synth_512:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_512

    cmp lenq, 512
    jg .1024pt
    ret

; 1024-point transform =========================================================
ALIGN 16
.1024pt:
    PUSH lenq
    mov lenq, 256

    add outq, 96*mmsize
    call .32pt

    add outq, 64*mmsize
    call .32pt

    POP lenq
    sub outq, 192*mmsize

    lea rtabq, [tab_1024_float]
    lea itabq, [tab_1024_float + 1024 - 4*7]

    cmp tgtq, 1024
    je .deinterleave

    mov tmpq, 8

.synth_1024:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_1024

    cmp lenq, 1024
    jg .2048pt
    ret

; 2048 to 131072-point transforms ==============================================
FFT_SPLIT_RADIX_DEF 2048,  .4096pt
FFT_SPLIT_RADIX_DEF 4096,  .8192pt
FFT_SPLIT_RADIX_DEF 8192,  .16384pt
FFT_SPLIT_RADIX_DEF 16384, .32768pt
FFT_SPLIT_RADIX_DEF 32768, .65536pt
FFT_SPLIT_RADIX_DEF 65536, .131072pt
FFT_SPLIT_RADIX_DEF 131072
;===============================================================================
; Final synthesis + deinterleaving code
;===============================================================================
.deinterleave:
    cmp lenq, 64
    je .64pt_deint

    imul tmpq, lenq, 2
    lea lutq, [4*lenq + tmpq]

.synth_deinterleave:
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, lutq
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub lenq, 4*mmsize
    jg .synth_deinterleave

    RET

; 64-point deinterleave which only has to load 4 registers =====================
.64pt_deint:
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e

    unpcklpd tmp1, m0, m2
    unpcklpd tmp2, m1, m3
    unpcklpd tw_o, tx1_e0, tx1_o0
    unpcklpd tw_e, tx2_e0, tx2_o0
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd tx1_e0, tx1_e0, tx1_o0
    unpckhpd tx2_e0, tx2_e0, tx2_o0

    vextractf128 [outq +  0*mmsize +  0], tmp1,   0
    vextractf128 [outq +  0*mmsize + 16], m0,     0
    vextractf128 [outq +  4*mmsize +  0], tmp2,   0
    vextractf128 [outq +  4*mmsize + 16], m1,     0

    vextractf128 [outq +  8*mmsize +  0], tw_o,   0
    vextractf128 [outq +  8*mmsize + 16], tx1_e0, 0
    vextractf128 [outq +  9*mmsize +  0], tw_o,   1
    vextractf128 [outq +  9*mmsize + 16], tx1_e0, 1

    vperm2f128 tmp1, tmp1, m0, 0x31
    vperm2f128 tmp2, tmp2, m1, 0x31

    vextractf128 [outq + 12*mmsize +  0], tw_e,   0
    vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
    vextractf128 [outq + 13*mmsize +  0], tw_e,   1
    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1

    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    movaps [outq +  1*mmsize], tmp1
    movaps [outq +  5*mmsize], tmp2

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    unpcklpd tmp1, m0, m1
    unpcklpd tmp2, m2, m3
    unpcklpd tw_e, tx1_e1, tx1_o1
    unpcklpd tw_o, tx2_e1, tx2_o1
    unpckhpd m0, m0, m1
    unpckhpd m2, m2, m3
    unpckhpd tx1_e1, tx1_e1, tx1_o1
    unpckhpd tx2_e1, tx2_e1, tx2_o1

    vextractf128 [outq +  2*mmsize +  0], tmp1,   0
    vextractf128 [outq +  2*mmsize + 16], m0,     0
    vextractf128 [outq +  3*mmsize +  0], tmp1,   1
    vextractf128 [outq +  3*mmsize + 16], m0,     1

    vextractf128 [outq +  6*mmsize +  0], tmp2,   0
    vextractf128 [outq +  6*mmsize + 16], m2,     0
    vextractf128 [outq +  7*mmsize +  0], tmp2,   1
    vextractf128 [outq +  7*mmsize + 16], m2,     1

    vextractf128 [outq + 10*mmsize +  0], tw_e,   0
    vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
    vextractf128 [outq + 11*mmsize +  0], tw_e,   1
    vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1

    vextractf128 [outq + 14*mmsize +  0], tw_o,   0
    vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
    vextractf128 [outq + 15*mmsize +  0], tw_o,   1
    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1

    RET
%endmacro

%if ARCH_X86_64
FFT_SPLIT_RADIX_FN avx,  float,    0
FFT_SPLIT_RADIX_FN avx,  ns_float, 1
%if HAVE_AVX2_EXTERNAL
FFT_SPLIT_RADIX_FN avx2, float,    0
FFT_SPLIT_RADIX_FN avx2, ns_float, 1
%endif
%endif