lavu/tx: refactor assembly codelet definition

This commit refactors the assembly codelet definitions to make them
more compact, and fixes compiler redefinition warnings. It also allows
other assembly versions to reuse the same boilerplate code as
x86.

Finally, it adds the out_of_place flag to all assembly codelets.
This changes no behavior, as out-of-place operation was already assumed
to be available, but it makes that assumption explicit.
This commit is contained in:
Lynne
2022-02-07 03:30:27 +01:00
parent a10f1aec1f
commit 3bbe9c5e38
2 changed files with 71 additions and 93 deletions

View File

@@ -23,19 +23,33 @@
#include "config.h"
/* These versions already do what we need them to do. */
/* The _ns names below are plain aliases: fft2_ns resolves to the regular
 * fft2 function and fft4_ns resolves to the forward fft4 function. */
#define ff_tx_fft2_ns_float_sse3 ff_tx_fft2_float_sse3
#define ff_tx_fft4_ns_float_sse2 ff_tx_fft4_fwd_float_sse2
/* NOTE(review): TX_DECL_FN is defined outside this view; presumably each
 * line declares the assembly function for the given transform name and
 * CPU-extension suffix -- confirm against the macro's definition. */
TX_DECL_FN(fft2, sse3)
TX_DECL_FN(fft4_fwd, sse2)
TX_DECL_FN(fft4_inv, sse2)
TX_DECL_FN(fft8, sse3)
TX_DECL_FN(fft8_ns, sse3)
TX_DECL_FN(fft8, avx)
TX_DECL_FN(fft8_ns, avx)
TX_DECL_FN(fft16, avx)
TX_DECL_FN(fft16_ns, avx)
TX_DECL_FN(fft16, fma3)
TX_DECL_FN(fft16_ns, fma3)
TX_DECL_FN(fft32, avx)
TX_DECL_FN(fft32_ns, avx)
TX_DECL_FN(fft32, fma3)
TX_DECL_FN(fft32_ns, fma3)
TX_DECL_FN(fft_sr, avx)
TX_DECL_FN(fft_sr_ns, avx)
TX_DECL_FN(fft_sr, avx2)
TX_DECL_FN(fft_sr_ns, avx2)
#define DECL_INIT_FN(basis, interleave) \
static av_cold int \
ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86 \
(AVTXContext *s, \
const FFTXCodelet *cd, \
uint64_t flags, \
FFTXCodeletOptions *opts, \
int len, int inv, \
const void *scale) \
static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
const FFTXCodelet *cd, \
uint64_t flags, \
FFTXCodeletOptions *opts, \
int len, int inv, \
const void *scale) \
{ \
const int inv_lookup = opts ? opts->invert_lookup : 1; \
ff_tx_init_tabs_float(len); \
@@ -46,95 +60,35 @@ static av_cold int \
basis, interleave); \
}
/* The b0_i0 init name expands to NULL, so a codelet definition that names
 * fft_sr_codelet_init_b0_i0 as its init ends up with a NULL .init pointer
 * (DECL_CD_DEF pastes the init name into ff_tx_<init>_x86). */
#define ff_tx_fft_sr_codelet_init_b0_i0_x86 NULL
/* Instantiate the init functions for (basis, interleave) = (8, 0) and (8, 2). */
DECL_INIT_FN(8, 0)
DECL_INIT_FN(8, 2)
/*
 * Emits a prototype for the transform function ff_tx_<fn> and defines a
 * static const FFTXCodelet named ff_tx_<fn>_def that refers to it.
 *
 * fn       - function name suffix; pasted after ff_tx_ and also stringized
 *            to fill the codelet's .name
 * t        - transform type token, passed through TX_TYPE() into .type
 * min, max - stored in .min_len and .max_len
 * f1, f2   - the two initializers of .factors
 * i        - init function name; pasted into ff_tx_<i>_x86 for .init
 * p        - stored in .prio
 * c        - stored in .cpu_flags
 * f        - additional flags, OR'd with FF_TX_ALIGNED into .flags
 */
#define DECL_CD_DEF(fn, t, min, max, f1, f2, i, p, c, f) \
void ff_tx_ ##fn(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \
static const FFTXCodelet ff_tx_ ##fn## _def = { \
.name = #fn, \
.function = ff_tx_ ##fn, \
.type = TX_TYPE(t), \
.flags = FF_TX_ALIGNED | f, \
.factors = { f1, f2 }, \
.min_len = min, \
.max_len = max, \
.init = ff_tx_ ##i## _x86, \
.cpu_flags = c, \
.prio = p, \
};
/*
 * Shorthand over DECL_CD_DEF for a single-length FFT codelet: the type is
 * FFT, min and max length are both len, and the factors are { 2, 0 }.
 *
 * fn_name  - forwarded as the function name suffix
 * len      - the one transform length the codelet accepts
 * init_fn  - pasted after fft_sr_codelet_init_ to form the init name
 * fn_prio  - forwarded as the priority
 * cpu      - pasted after AV_CPU_FLAG_ to form the CPU flag
 * fn_flags - OR'd with FF_TX_OUT_OF_PLACE and forwarded as the extra flags
 */
#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \
DECL_CD_DEF(fn_name, FFT, len, len, 2, 0, \
fft_sr_codelet_init_ ##init_fn, fn_prio, \
AV_CPU_FLAG_ ##cpu, FF_TX_OUT_OF_PLACE | fn_flags)
/*
 * Fixed-length codelet definitions. DECL_SR_CD_DEF arguments, in order:
 * function name, length, init suffix, priority, CPU flag suffix, extra flags.
 * Entries using the b0_i0 init suffix get a NULL init pointer.
 */
DECL_SR_CD_DEF(fft2_float_sse3, 2, b0_i0, 128, SSE3, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft2_ns_float_sse3, 2, b8_i0, 192, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
DECL_SR_CD_DEF(fft4_fwd_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY)
DECL_SR_CD_DEF(fft4_inv_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY)
DECL_SR_CD_DEF(fft4_ns_float_sse2, 4, b8_i0, 192, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
DECL_SR_CD_DEF(fft8_float_sse3, 8, b8_i0, 128, SSE3, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft8_ns_float_sse3, 8, b8_i0, 192, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
DECL_SR_CD_DEF(fft8_float_avx, 8, b8_i0, 256, AVX, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft8_ns_float_avx, 8, b8_i0, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
DECL_SR_CD_DEF(fft16_float_avx, 16, b8_i2, 256, AVX, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft16_ns_float_avx, 16, b8_i2, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
DECL_SR_CD_DEF(fft16_float_fma3, 16, b8_i2, 288, FMA3, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft16_ns_float_fma3, 16, b8_i2, 352, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
/* The 32-point and variable-length (fft_sr) codelets are only defined when
 * ARCH_X86_64 is set. */
#if ARCH_X86_64
DECL_SR_CD_DEF(fft32_float_avx, 32, b8_i2, 256, AVX, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft32_ns_float_avx, 32, b8_i2, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
DECL_SR_CD_DEF(fft32_float_fma3, 32, b8_i2, 288, FMA3, AV_TX_INPLACE)
DECL_SR_CD_DEF(fft32_ns_float_fma3, 32, b8_i2, 352, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
/* The fft_sr codelets span lengths 64..131072, so they use DECL_CD_DEF
 * directly instead of the single-length DECL_SR_CD_DEF shorthand. */
DECL_CD_DEF(fft_sr_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
256, AV_CPU_FLAG_AVX,
FF_TX_OUT_OF_PLACE)
DECL_CD_DEF(fft_sr_ns_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
320, AV_CPU_FLAG_AVX,
FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE)
/* The AVX2 variants are additionally gated on HAVE_AVX2_EXTERNAL. */
#if HAVE_AVX2_EXTERNAL
DECL_CD_DEF(fft_sr_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
288, AV_CPU_FLAG_AVX2 | AV_CPU_FLAG_AVXSLOW,
FF_TX_OUT_OF_PLACE)
DECL_CD_DEF(fft_sr_ns_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
352, AV_CPU_FLAG_AVX2 | AV_CPU_FLAG_AVXSLOW,
FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE)
#endif
#endif
const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
&ff_tx_fft2_float_sse3_def,
&ff_tx_fft2_ns_float_sse3_def,
&ff_tx_fft4_fwd_float_sse2_def,
&ff_tx_fft4_inv_float_sse2_def,
&ff_tx_fft4_ns_float_sse2_def,
&ff_tx_fft8_float_sse3_def,
&ff_tx_fft8_ns_float_sse3_def,
&ff_tx_fft8_float_avx_def,
&ff_tx_fft8_ns_float_avx_def,
&ff_tx_fft16_float_avx_def,
&ff_tx_fft16_ns_float_avx_def,
&ff_tx_fft16_float_fma3_def,
&ff_tx_fft16_ns_float_fma3_def,
TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, 0),
TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, 0),
TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, 0),
TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
#if ARCH_X86_64
&ff_tx_fft32_float_avx_def,
&ff_tx_fft32_ns_float_avx_def,
&ff_tx_fft32_float_fma3_def,
&ff_tx_fft32_ns_float_fma3_def,
&ff_tx_fft_sr_float_avx_def,
&ff_tx_fft_sr_ns_float_avx_def,
TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, 0),
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, 0),
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
#if HAVE_AVX2_EXTERNAL
&ff_tx_fft_sr_float_avx2_def,
&ff_tx_fft_sr_ns_float_avx2_def,
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 256, b8_i2, avx, AVX, 0, 0),
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, avx2, AVX2, 0, AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW),
#endif
#endif