mirror of
https://github.com/nyanmisaka/ffmpeg-rockchip.git
synced 2025-12-24 11:52:06 +08:00
x86/tx_float: add permute-free FFT versions
These are used in the PFA transforms and MDCTs.
This commit is contained in:
@@ -707,13 +707,21 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
|
||||
FFT4 fwd, 0
|
||||
FFT4 inv, 1
|
||||
|
||||
%macro FFT8_FN 2
|
||||
INIT_XMM sse3
|
||||
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
|
||||
cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
|
||||
%if %2
|
||||
mov ctxq, [ctxq + AVTXContext.map]
|
||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
|
||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
|
||||
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
|
||||
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
|
||||
%else
|
||||
movaps m0, [inq + 0*mmsize]
|
||||
movaps m1, [inq + 1*mmsize]
|
||||
movaps m2, [inq + 2*mmsize]
|
||||
movaps m3, [inq + 3*mmsize]
|
||||
%endif
|
||||
|
||||
FFT8 m0, m1, m2, m3, m4, m5
|
||||
|
||||
@@ -728,12 +736,22 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
|
||||
movups [outq + 3*mmsize], m1
|
||||
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
FFT8_FN float, 1
|
||||
FFT8_FN ns_float, 0
|
||||
|
||||
%macro FFT16_FN 2
|
||||
INIT_YMM avx
|
||||
cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
|
||||
cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
|
||||
%if %2
|
||||
mov ctxq, [ctxq + AVTXContext.map]
|
||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
|
||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
|
||||
%else
|
||||
movaps m0, [inq + 0*mmsize]
|
||||
movaps m1, [inq + 1*mmsize]
|
||||
%endif
|
||||
|
||||
FFT8_AVX m0, m1, m2, m3
|
||||
|
||||
@@ -747,16 +765,26 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
|
||||
vextractf128 [outq + 16*3], m0, 1
|
||||
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%macro FFT16_FN 1
|
||||
FFT16_FN float, 1
|
||||
FFT16_FN ns_float, 0
|
||||
|
||||
%macro FFT16_FN 3
|
||||
INIT_YMM %1
|
||||
cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
|
||||
cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
|
||||
%if %3
|
||||
movaps m0, [inq + 0*mmsize]
|
||||
movaps m1, [inq + 1*mmsize]
|
||||
movaps m2, [inq + 2*mmsize]
|
||||
movaps m3, [inq + 3*mmsize]
|
||||
%else
|
||||
mov ctxq, [ctxq + AVTXContext.map]
|
||||
|
||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
|
||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
|
||||
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
|
||||
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
|
||||
%endif
|
||||
|
||||
FFT16 m0, m1, m2, m3, m4, m5, m6, m7
|
||||
|
||||
@@ -777,25 +805,40 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
FFT16_FN avx
|
||||
FFT16_FN fma3
|
||||
FFT16_FN avx, float, 0
|
||||
FFT16_FN avx, ns_float, 1
|
||||
FFT16_FN fma3, float, 0
|
||||
FFT16_FN fma3, ns_float, 1
|
||||
|
||||
%macro FFT32_FN 1
|
||||
%macro FFT32_FN 3
|
||||
INIT_YMM %1
|
||||
cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
|
||||
cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
|
||||
%if %3
|
||||
movaps m4, [inq + 4*mmsize]
|
||||
movaps m5, [inq + 5*mmsize]
|
||||
movaps m6, [inq + 6*mmsize]
|
||||
movaps m7, [inq + 7*mmsize]
|
||||
%else
|
||||
mov ctxq, [ctxq + AVTXContext.map]
|
||||
|
||||
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9
|
||||
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
|
||||
LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m12, m13
|
||||
LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m14, m15
|
||||
%endif
|
||||
|
||||
FFT8 m4, m5, m6, m7, m8, m9
|
||||
|
||||
%if %3
|
||||
movaps m0, [inq + 0*mmsize]
|
||||
movaps m1, [inq + 1*mmsize]
|
||||
movaps m2, [inq + 2*mmsize]
|
||||
movaps m3, [inq + 3*mmsize]
|
||||
%else
|
||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m9
|
||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m10, m11
|
||||
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
|
||||
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
|
||||
%endif
|
||||
|
||||
movaps m8, [tab_32_float]
|
||||
vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23
|
||||
@@ -836,8 +879,10 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
FFT32_FN avx
|
||||
FFT32_FN fma3
|
||||
FFT32_FN avx, float, 0
|
||||
FFT32_FN avx, ns_float, 1
|
||||
FFT32_FN fma3, float, 0
|
||||
FFT32_FN fma3, ns_float, 1
|
||||
%endif
|
||||
|
||||
%macro FFT_SPLIT_RADIX_DEF 1-2
|
||||
@@ -878,9 +923,9 @@ ALIGN 16
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro FFT_SPLIT_RADIX_FN 1
|
||||
%macro FFT_SPLIT_RADIX_FN 3
|
||||
INIT_YMM %1
|
||||
cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
|
||||
cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
|
||||
movsxd lenq, dword [lutq + AVTXContext.len]
|
||||
mov lutq, [lutq + AVTXContext.map]
|
||||
mov tgtq, lenq
|
||||
@@ -888,17 +933,31 @@ cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
|
||||
; Bottom-most/32-point transform ===============================================
|
||||
ALIGN 16
|
||||
.32pt:
|
||||
%if %3
|
||||
movaps m4, [inq + 4*mmsize]
|
||||
movaps m5, [inq + 5*mmsize]
|
||||
movaps m6, [inq + 6*mmsize]
|
||||
movaps m7, [inq + 7*mmsize]
|
||||
%else
|
||||
LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m9
|
||||
LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m10, m11
|
||||
LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m12, m13
|
||||
LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m14, m15
|
||||
%endif
|
||||
|
||||
FFT8 m4, m5, m6, m7, m8, m9
|
||||
|
||||
%if %3
|
||||
movaps m0, [inq + 0*mmsize]
|
||||
movaps m1, [inq + 1*mmsize]
|
||||
movaps m2, [inq + 2*mmsize]
|
||||
movaps m3, [inq + 3*mmsize]
|
||||
%else
|
||||
LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m9
|
||||
LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m10, m11
|
||||
LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
|
||||
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
|
||||
%endif
|
||||
|
||||
movaps m8, [tab_32_float]
|
||||
vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23
|
||||
@@ -913,7 +972,11 @@ ALIGN 16
|
||||
movaps [outq + 5*mmsize], m5
|
||||
movaps [outq + 7*mmsize], m7
|
||||
|
||||
%if %3
|
||||
add inq, 8*mmsize
|
||||
%else
|
||||
add lutq, (mmsize/2)*8
|
||||
%endif
|
||||
cmp lenq, 32
|
||||
jg .64pt
|
||||
|
||||
@@ -944,24 +1007,42 @@ ALIGN 16
|
||||
SWAP m4, m1
|
||||
SWAP m6, m3
|
||||
|
||||
%if %3
|
||||
movaps tx1_e0, [inq + 0*mmsize]
|
||||
movaps tx1_e1, [inq + 1*mmsize]
|
||||
movaps tx1_o0, [inq + 2*mmsize]
|
||||
movaps tx1_o1, [inq + 3*mmsize]
|
||||
%else
|
||||
LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tw_o
|
||||
LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tmp1, tmp2
|
||||
LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tw_o
|
||||
LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp1, tmp2
|
||||
%endif
|
||||
|
||||
FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
|
||||
|
||||
%if %3
|
||||
movaps tx2_e0, [inq + 4*mmsize]
|
||||
movaps tx2_e1, [inq + 5*mmsize]
|
||||
movaps tx2_o0, [inq + 6*mmsize]
|
||||
movaps tx2_o1, [inq + 7*mmsize]
|
||||
%else
|
||||
LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1, tmp2
|
||||
LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_e, tw_o
|
||||
LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tmp1, tmp2
|
||||
LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e, tw_o
|
||||
%endif
|
||||
|
||||
FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
|
||||
|
||||
movaps tw_e, [tab_64_float]
|
||||
vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
|
||||
|
||||
%if %3
|
||||
add inq, 8*mmsize
|
||||
%else
|
||||
add lutq, (mmsize/2)*8
|
||||
%endif
|
||||
cmp tgtq, 64
|
||||
je .deinterleave
|
||||
|
||||
@@ -1204,8 +1285,10 @@ FFT_SPLIT_RADIX_DEF 131072
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
FFT_SPLIT_RADIX_FN avx
|
||||
FFT_SPLIT_RADIX_FN avx, float, 0
|
||||
FFT_SPLIT_RADIX_FN avx, ns_float, 1
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
FFT_SPLIT_RADIX_FN avx2
|
||||
FFT_SPLIT_RADIX_FN avx2, float, 0
|
||||
FFT_SPLIT_RADIX_FN avx2, ns_float, 1
|
||||
%endif
|
||||
%endif
|
||||
|
||||
Reference in New Issue
Block a user