From 28bff6ae5435b0a6574067de8c129cbe7225c729 Mon Sep 17 00:00:00 2001 From: Lynne Date: Tue, 25 Jan 2022 22:39:09 +0100 Subject: x86/tx_float: add permute-free FFT versions These are used in the PFA transforms and MDCTs. --- libavutil/x86/tx_float.asm | 115 ++++++++++++++++++++++++++++++++++++------ libavutil/x86/tx_float_init.c | 102 +++++++++++++++++++++---------------- 2 files changed, 158 insertions(+), 59 deletions(-) (limited to 'libavutil/x86') diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index e3b48d7c1f..8698e858d0 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -707,13 +707,21 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride FFT4 fwd, 0 FFT4 inv, 1 +%macro FFT8_FN 2 INIT_XMM sse3 -cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp +cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp +%if %2 mov ctxq, [ctxq + AVTXContext.map] LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq +%else + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%endif FFT8 m0, m1, m2, m3, m4, m5 @@ -728,12 +736,22 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp movups [outq + 3*mmsize], m1 RET +%endmacro + +FFT8_FN float, 1 +FFT8_FN ns_float, 0 +%macro FFT16_FN 2 INIT_YMM avx -cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp +cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp +%if %2 mov ctxq, [ctxq + AVTXContext.map] LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3 +%else + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] +%endif FFT8_AVX m0, m1, m2, m3 @@ -747,16 +765,26 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp vextractf128 [outq + 16*3], m0, 1 RET +%endmacro -%macro FFT16_FN 1 +FFT16_FN float, 1 +FFT16_FN ns_float, 0 + +%macro FFT16_FN 3 INIT_YMM %1 -cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp +cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp +%if %3 + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else mov ctxq, [ctxq + AVTXContext.map] - LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5 LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6 LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7 +%endif FFT16 m0, m1, m2, m3, m4, m5, m6, m7 @@ -777,25 +805,40 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp RET %endmacro -FFT16_FN avx -FFT16_FN fma3 +FFT16_FN avx, float, 0 +FFT16_FN avx, ns_float, 1 +FFT16_FN fma3, float, 0 +FFT16_FN fma3, ns_float, 1 -%macro FFT32_FN 1 +%macro FFT32_FN 3 INIT_YMM %1 -cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp +cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp +%if %3 + movaps m4, [inq + 4*mmsize] + movaps m5, [inq + 5*mmsize] + movaps m6, [inq + 6*mmsize] + movaps m7, [inq + 7*mmsize] +%else mov ctxq, [ctxq + AVTXContext.map] - LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9 LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11 LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m12, m13 LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m14, m15 +%endif FFT8 m4, m5, m6, m7, m8, m9 +%if %3 + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m9 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m10, m11 LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13 LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15 +%endif movaps m8, [tab_32_float] vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23 @@ -836,8 +879,10 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp %endmacro %if ARCH_X86_64 -FFT32_FN avx -FFT32_FN fma3 +FFT32_FN avx, float, 0 +FFT32_FN avx, ns_float, 1 +FFT32_FN fma3, float, 0 +FFT32_FN fma3, ns_float, 1 %endif %macro FFT_SPLIT_RADIX_DEF 1-2 @@ -878,9 +923,9 @@ ALIGN 16 %endif %endmacro -%macro FFT_SPLIT_RADIX_FN 1 +%macro FFT_SPLIT_RADIX_FN 3 INIT_YMM %1 -cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt +cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt movsxd lenq, dword [lutq + AVTXContext.len] mov lutq, [lutq + AVTXContext.map] mov tgtq, lenq @@ -888,17 +933,31 @@ cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt ; Bottom-most/32-point transform =============================================== ALIGN 16 .32pt: +%if %3 + movaps m4, [inq + 4*mmsize] + movaps m5, [inq + 5*mmsize] + movaps m6, [inq + 6*mmsize] + movaps m7, [inq + 7*mmsize] +%else LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m9 LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m10, m11 LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m12, m13 LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m14, m15 +%endif FFT8 m4, m5, m6, m7, m8, m9 +%if %3 + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m9 LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m10, m11 LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13 LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15 +%endif movaps m8, [tab_32_float] vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23 @@ -913,7 +972,11 @@ ALIGN 16 movaps [outq + 5*mmsize], m5 movaps [outq + 7*mmsize], m7 +%if %3 + add inq, 8*mmsize +%else add lutq, (mmsize/2)*8 +%endif cmp lenq, 32 jg .64pt @@ -944,24 +1007,42 @@ ALIGN 16 SWAP m4, m1 SWAP m6, m3 +%if %3 + movaps tx1_e0, [inq + 0*mmsize] + movaps tx1_e1, [inq + 1*mmsize] + movaps tx1_o0, [inq + 2*mmsize] + movaps tx1_o1, [inq + 3*mmsize] +%else LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tw_o LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tmp1, tmp2 LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tw_o LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp1, tmp2 +%endif FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1 +%if %3 + movaps tx2_e0, [inq + 4*mmsize] + movaps tx2_e1, [inq + 5*mmsize] + movaps tx2_o0, [inq + 6*mmsize] + movaps tx2_o1, [inq + 7*mmsize] +%else LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1, tmp2 LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_e, tw_o LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tmp1, tmp2 LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e, tw_o +%endif FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o movaps tw_e, [tab_64_float] vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23 +%if %3 + add inq, 8*mmsize +%else add lutq, (mmsize/2)*8 +%endif cmp tgtq, 64 je .deinterleave @@ -1204,8 +1285,10 @@ FFT_SPLIT_RADIX_DEF 131072 %endmacro %if ARCH_X86_64 -FFT_SPLIT_RADIX_FN avx +FFT_SPLIT_RADIX_FN avx, float, 0 +FFT_SPLIT_RADIX_FN avx, ns_float, 1 %if HAVE_AVX2_EXTERNAL -FFT_SPLIT_RADIX_FN avx2 +FFT_SPLIT_RADIX_FN avx2, float, 0 +FFT_SPLIT_RADIX_FN avx2, ns_float, 1 %endif %endif diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 5c9afa75cc..32843b66a4 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -23,6 +23,10 @@ #include "config.h" +/* These versions already do what we need them to do. */ +#define ff_tx_fft2_ns_float_sse3 ff_tx_fft2_float_sse3 +#define ff_tx_fft4_ns_float_sse2 ff_tx_fft4_fwd_float_sse2 + #define DECL_INIT_FN(basis, interleave) \ static av_cold int \ ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86 \ @@ -35,90 +39,102 @@ static av_cold int \ { \ const int inv_lookup = opts ? opts->invert_lookup : 1; \ ff_tx_init_tabs_float(len); \ - return ff_tx_gen_split_radix_parity_revtab(s, inv_lookup, \ - basis, interleave); \ + if (cd->max_len == 2) \ + return ff_tx_gen_ptwo_revtab(s, inv_lookup); \ + else \ + return ff_tx_gen_split_radix_parity_revtab(s, inv_lookup, \ + basis, interleave); \ } #define ff_tx_fft_sr_codelet_init_b0_i0_x86 NULL DECL_INIT_FN(8, 0) DECL_INIT_FN(8, 2) -#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \ -void ff_tx_ ##fn_name(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \ -static const FFTXCodelet ff_tx_ ##fn_name## _def = { \ - .name = #fn_name, \ - .function = ff_tx_ ##fn_name, \ - .type = TX_TYPE(FFT), \ - .flags = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED | fn_flags, \ - .factors[0] = 2, \ - .min_len = len, \ - .max_len = len, \ - .init = ff_tx_fft_sr_codelet_init_ ##init_fn## _x86, \ - .cpu_flags = AV_CPU_FLAG_ ##cpu, \ - .prio = fn_prio, \ +#define DECL_CD_DEF(fn, t, min, max, f1, f2, i, p, c, f) \ +void ff_tx_ ##fn(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \ +static const FFTXCodelet ff_tx_ ##fn## _def = { \ + .name = #fn, \ + .function = ff_tx_ ##fn, \ + .type = TX_TYPE(t), \ + .flags = FF_TX_ALIGNED | f, \ + .factors = { f1, f2 }, \ + .min_len = min, \ + .max_len = max, \ + .init = ff_tx_ ##i## _x86, \ + .cpu_flags = c, \ + .prio = p, \ }; +#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \ + DECL_CD_DEF(fn_name, FFT, len, len, 2, 0, \ + fft_sr_codelet_init_ ##init_fn, fn_prio, \ + AV_CPU_FLAG_ ##cpu, fn_flags) \ + DECL_SR_CD_DEF(fft2_float_sse3, 2, b0_i0, 128, SSE3, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft2_ns_float_sse3, 2, b8_i0, 192, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE) DECL_SR_CD_DEF(fft4_fwd_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY) DECL_SR_CD_DEF(fft4_inv_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY) +DECL_SR_CD_DEF(fft4_ns_float_sse2, 4, b8_i0, 192, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE) DECL_SR_CD_DEF(fft8_float_sse3, 8, b8_i0, 128, SSE3, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft8_ns_float_sse3, 8, b8_i0, 192, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE) DECL_SR_CD_DEF(fft8_float_avx, 8, b8_i0, 256, AVX, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft8_ns_float_avx, 8, b8_i0, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE) DECL_SR_CD_DEF(fft16_float_avx, 16, b8_i2, 256, AVX, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft16_ns_float_avx, 16, b8_i2, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE) DECL_SR_CD_DEF(fft16_float_fma3, 16, b8_i2, 288, FMA3, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft16_ns_float_fma3, 16, b8_i2, 352, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE) #if ARCH_X86_64 DECL_SR_CD_DEF(fft32_float_avx, 32, b8_i2, 256, AVX, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft32_ns_float_avx, 32, b8_i2, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE) DECL_SR_CD_DEF(fft32_float_fma3, 32, b8_i2, 288, FMA3, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft32_ns_float_fma3, 32, b8_i2, 352, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE) -void ff_tx_fft_sr_float_avx(AVTXContext *s, void *out, void *in, ptrdiff_t stride); -const FFTXCodelet ff_tx_fft_sr_float_avx_def = { - .name = "fft_sr_float_avx", - .function = ff_tx_fft_sr_float_avx, - .type = TX_TYPE(FFT), - .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE, - .factors[0] = 2, - .min_len = 64, - .max_len = 131072, - .init = ff_tx_fft_sr_codelet_init_b8_i2_x86, - .cpu_flags = AV_CPU_FLAG_AVX, - .prio = 256, -}; +DECL_CD_DEF(fft_sr_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2, + 256, AV_CPU_FLAG_AVX, + FF_TX_OUT_OF_PLACE) + +DECL_CD_DEF(fft_sr_ns_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2, + 320, AV_CPU_FLAG_AVX, + FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE) #if HAVE_AVX2_EXTERNAL -void ff_tx_fft_sr_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride); -const FFTXCodelet ff_tx_fft_sr_float_avx2_def = { - .name = "fft_sr_float_avx2", - .function = ff_tx_fft_sr_float_avx2, - .type = TX_TYPE(FFT), - .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE, - .factors[0] = 2, - .min_len = 64, - .max_len = 131072, - .init = ff_tx_fft_sr_codelet_init_b8_i2_x86, - .cpu_flags = AV_CPU_FLAG_AVX2, - .prio = 288, -}; +DECL_CD_DEF(fft_sr_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2, + 288, AV_CPU_FLAG_AVX2, + FF_TX_OUT_OF_PLACE) + +DECL_CD_DEF(fft_sr_ns_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2, + 352, AV_CPU_FLAG_AVX2, + FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE) #endif #endif const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { - /* Split-Radix codelets */ &ff_tx_fft2_float_sse3_def, + &ff_tx_fft2_ns_float_sse3_def, &ff_tx_fft4_fwd_float_sse2_def, &ff_tx_fft4_inv_float_sse2_def, + &ff_tx_fft4_ns_float_sse2_def, &ff_tx_fft8_float_sse3_def, + &ff_tx_fft8_ns_float_sse3_def, &ff_tx_fft8_float_avx_def, + &ff_tx_fft8_ns_float_avx_def, &ff_tx_fft16_float_avx_def, + &ff_tx_fft16_ns_float_avx_def, &ff_tx_fft16_float_fma3_def, + &ff_tx_fft16_ns_float_fma3_def, #if ARCH_X86_64 &ff_tx_fft32_float_avx_def, + &ff_tx_fft32_ns_float_avx_def, &ff_tx_fft32_float_fma3_def, + &ff_tx_fft32_ns_float_fma3_def, - /* Standalone transforms */ &ff_tx_fft_sr_float_avx_def, + &ff_tx_fft_sr_ns_float_avx_def, #if HAVE_AVX2_EXTERNAL &ff_tx_fft_sr_float_avx2_def, + &ff_tx_fft_sr_ns_float_avx2_def, #endif #endif -- cgit v1.2.3