diff options
author | Lynne <dev@lynne.ee> | 2022-01-20 07:14:46 +0100 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-01-26 04:12:44 +0100 |
commit | ef4bd8161575a79f0ac247ad0aa2f05b8c20052b (patch) | |
tree | cf8488b2f2e9b0b88dd04b511113289d79852486 /libavutil/x86 | |
parent | c14976be045f3fe658c12d7e30946cdb380452ec (diff) |
lavu/tx: rewrite internal code as a tree-based codelet constructor
This commit rewrites the internal transform code into a constructor
that stitches transforms (codelets).
This allows for transforms to reuse arbitrary parts of other
transforms, and allows transforms to be stacked onto one
another (such as a full iMDCT using a half-iMDCT which in turn
uses an FFT). It also permits each step to be individually
replaced by assembly or a custom implementation (such as an ASIC).
Diffstat (limited to 'libavutil/x86')
-rw-r--r-- | libavutil/x86/tx_float.asm | 85 | ||||
-rw-r--r-- | libavutil/x86/tx_float_init.c | 160 |
2 files changed, 131 insertions, 114 deletions
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index 4d2283fae1..e3b48d7c1f 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -31,6 +31,8 @@ %include "x86util.asm" +%define private_prefix ff_tx + %if ARCH_X86_64 %define ptr resq %else @@ -39,25 +41,22 @@ %assign i 16 %rep 14 -cextern cos_ %+ i %+ _float ; ff_cos_i_float... +cextern tab_ %+ i %+ _float ; ff_tab_i_float... %assign i (i << 1) %endrep struc AVTXContext - .n: resd 1 ; Non-power-of-two part - .m: resd 1 ; Power-of-two part - .inv: resd 1 ; Is inverse - .type: resd 1 ; Type - .flags: resq 1 ; Flags - .scale: resq 1 ; Scale - - .exptab: ptr 1 ; MDCT exptab - .tmp: ptr 1 ; Temporary buffer needed for all compound transforms - .pfatab: ptr 1 ; Input/Output mapping for compound transforms - .revtab: ptr 1 ; Input mapping for power of two transforms - .inplace_idx: ptr 1 ; Required indices to revtab for in-place transforms - - .top_tx ptr 1 ; Used for transforms derived from other transforms + .len: resd 1 ; Length + .inv resd 1 ; Inverse flag + .map: ptr 1 ; Lookup table(s) + .exp: ptr 1 ; Exponentiation factors + .tmp: ptr 1 ; Temporary data + + .sub: ptr 1 ; Subcontexts + .fn: ptr 4 ; Subcontext functions + .nb_sub: resd 1 ; Subcontext count + + ; Everything else is inaccessible endstruc SECTION_RODATA 32 @@ -485,8 +484,8 @@ SECTION .text movaps [outq + 10*mmsize], tx1_o0 movaps [outq + 14*mmsize], tx2_o0 - movaps tw_e, [cos_64_float + mmsize] - vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23 + movaps tw_e, [tab_64_float + mmsize] + vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23 movaps m0, [outq + 1*mmsize] movaps m1, [outq + 3*mmsize] @@ -710,8 +709,7 @@ FFT4 inv, 1 INIT_XMM sse3 cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp - mov ctxq, [ctxq + AVTXContext.revtab] - + mov ctxq, [ctxq + AVTXContext.map] LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq LOAD64_LUT m2, inq, ctxq, 
(mmsize/2)*2, tmpq @@ -733,8 +731,7 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp INIT_YMM avx cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp - mov ctxq, [ctxq + AVTXContext.revtab] - + mov ctxq, [ctxq + AVTXContext.map] LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3 @@ -754,7 +751,7 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp %macro FFT16_FN 1 INIT_YMM %1 cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp - mov ctxq, [ctxq + AVTXContext.revtab] + mov ctxq, [ctxq + AVTXContext.map] LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5 @@ -786,7 +783,7 @@ FFT16_FN fma3 %macro FFT32_FN 1 INIT_YMM %1 cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp - mov ctxq, [ctxq + AVTXContext.revtab] + mov ctxq, [ctxq + AVTXContext.map] LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9 LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11 @@ -800,8 +797,8 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13 LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15 - movaps m8, [cos_32_float] - vperm2f128 m9, m9, [cos_32_float + 4*8 - 4*7], 0x23 + movaps m8, [tab_32_float] + vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23 FFT16 m0, m1, m2, m3, m10, m11, m12, m13 @@ -858,8 +855,8 @@ ALIGN 16 POP lenq sub outq, (%1*4) + (%1*2) + (%1/2) - lea rtabq, [cos_ %+ %1 %+ _float] - lea itabq, [cos_ %+ %1 %+ _float + %1 - 4*7] + lea rtabq, [tab_ %+ %1 %+ _float] + lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7] %if %0 > 1 cmp tgtq, %1 @@ -883,9 +880,9 @@ ALIGN 16 %macro FFT_SPLIT_RADIX_FN 1 INIT_YMM %1 -cglobal split_radix_fft_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt - movsxd lenq, dword [lutq + AVTXContext.m] - mov lutq, [lutq + AVTXContext.revtab] +cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt + movsxd lenq, dword [lutq + AVTXContext.len] + mov lutq, [lutq + AVTXContext.map] mov tgtq, 
lenq ; Bottom-most/32-point transform =============================================== @@ -903,8 +900,8 @@ ALIGN 16 LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13 LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15 - movaps m8, [cos_32_float] - vperm2f128 m9, m9, [cos_32_float + 32 - 4*7], 0x23 + movaps m8, [tab_32_float] + vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23 FFT16 m0, m1, m2, m3, m10, m11, m12, m13 @@ -961,8 +958,8 @@ ALIGN 16 FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o - movaps tw_e, [cos_64_float] - vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7], 0x23 + movaps tw_e, [tab_64_float] + vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23 add lutq, (mmsize/2)*8 cmp tgtq, 64 @@ -989,8 +986,8 @@ ALIGN 16 POP lenq sub outq, 24*mmsize - lea rtabq, [cos_128_float] - lea itabq, [cos_128_float + 128 - 4*7] + lea rtabq, [tab_128_float] + lea itabq, [tab_128_float + 128 - 4*7] cmp tgtq, 128 je .deinterleave @@ -1016,8 +1013,8 @@ ALIGN 16 POP lenq sub outq, 48*mmsize - lea rtabq, [cos_256_float] - lea itabq, [cos_256_float + 256 - 4*7] + lea rtabq, [tab_256_float] + lea itabq, [tab_256_float + 256 - 4*7] cmp tgtq, 256 je .deinterleave @@ -1044,8 +1041,8 @@ ALIGN 16 POP lenq sub outq, 96*mmsize - lea rtabq, [cos_512_float] - lea itabq, [cos_512_float + 512 - 4*7] + lea rtabq, [tab_512_float] + lea itabq, [tab_512_float + 512 - 4*7] cmp tgtq, 512 je .deinterleave @@ -1079,8 +1076,8 @@ ALIGN 16 POP lenq sub outq, 192*mmsize - lea rtabq, [cos_1024_float] - lea itabq, [cos_1024_float + 1024 - 4*7] + lea rtabq, [tab_1024_float] + lea itabq, [tab_1024_float + 1024 - 4*7] cmp tgtq, 1024 je .deinterleave @@ -1160,8 +1157,8 @@ FFT_SPLIT_RADIX_DEF 131072 vextractf128 [outq + 13*mmsize + 0], tw_e, 1 vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1 - movaps tw_e, [cos_64_float + mmsize] - vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23 + movaps tw_e, [tab_64_float + mmsize] + vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - 
mmsize], 0x23 movaps m0, [outq + 1*mmsize] movaps m1, [outq + 3*mmsize] diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 8b77a5f29f..5c9afa75cc 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -21,86 +21,106 @@ #include "libavutil/attributes.h" #include "libavutil/x86/cpu.h" -void ff_fft2_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_fft4_inv_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_fft4_fwd_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_fft8_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_fft8_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_fft16_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_fft16_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_fft32_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_fft32_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); +#include "config.h" -void ff_split_radix_fft_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_split_radix_fft_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride); - -av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx) -{ - int cpu_flags = av_get_cpu_flags(); - int gen_revtab = 0, basis, revtab_interleave; +#define DECL_INIT_FN(basis, interleave) \ +static av_cold int \ + ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86 \ + (AVTXContext *s, \ + const FFTXCodelet *cd, \ + uint64_t flags, \ + FFTXCodeletOptions *opts, \ + int len, int inv, \ + const void *scale) \ +{ \ + const int inv_lookup = opts ? 
opts->invert_lookup : 1; \ + ff_tx_init_tabs_float(len); \ + return ff_tx_gen_split_radix_parity_revtab(s, inv_lookup, \ + basis, interleave); \ +} - if (s->flags & AV_TX_UNALIGNED) - return; +#define ff_tx_fft_sr_codelet_init_b0_i0_x86 NULL +DECL_INIT_FN(8, 0) +DECL_INIT_FN(8, 2) - if (ff_tx_type_is_mdct(s->type)) - return; +#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \ +void ff_tx_ ##fn_name(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \ +static const FFTXCodelet ff_tx_ ##fn_name## _def = { \ + .name = #fn_name, \ + .function = ff_tx_ ##fn_name, \ + .type = TX_TYPE(FFT), \ + .flags = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED | fn_flags, \ + .factors[0] = 2, \ + .min_len = len, \ + .max_len = len, \ + .init = ff_tx_fft_sr_codelet_init_ ##init_fn## _x86, \ + .cpu_flags = AV_CPU_FLAG_ ##cpu, \ + .prio = fn_prio, \ +}; -#define TXFN(fn, gentab, sr_basis, interleave) \ - do { \ - *tx = fn; \ - gen_revtab = gentab; \ - basis = sr_basis; \ - revtab_interleave = interleave; \ - } while (0) +DECL_SR_CD_DEF(fft2_float_sse3, 2, b0_i0, 128, SSE3, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft4_fwd_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY) +DECL_SR_CD_DEF(fft4_inv_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY) +DECL_SR_CD_DEF(fft8_float_sse3, 8, b8_i0, 128, SSE3, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft8_float_avx, 8, b8_i0, 256, AVX, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft16_float_avx, 16, b8_i2, 256, AVX, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft16_float_fma3, 16, b8_i2, 288, FMA3, AV_TX_INPLACE) - if (s->n == 1) { - if (EXTERNAL_SSE2(cpu_flags)) { - if (s->m == 4 && s->inv) - TXFN(ff_fft4_inv_float_sse2, 0, 0, 0); - else if (s->m == 4) - TXFN(ff_fft4_fwd_float_sse2, 0, 0, 0); - } +#if ARCH_X86_64 +DECL_SR_CD_DEF(fft32_float_avx, 32, b8_i2, 256, AVX, AV_TX_INPLACE) +DECL_SR_CD_DEF(fft32_float_fma3, 32, b8_i2, 288, FMA3, AV_TX_INPLACE) - if (EXTERNAL_SSE3(cpu_flags)) { - if (s->m == 2) - TXFN(ff_fft2_float_sse3, 0, 0, 0); - 
else if (s->m == 8) - TXFN(ff_fft8_float_sse3, 1, 8, 0); - } +void ff_tx_fft_sr_float_avx(AVTXContext *s, void *out, void *in, ptrdiff_t stride); +const FFTXCodelet ff_tx_fft_sr_float_avx_def = { + .name = "fft_sr_float_avx", + .function = ff_tx_fft_sr_float_avx, + .type = TX_TYPE(FFT), + .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE, + .factors[0] = 2, + .min_len = 64, + .max_len = 131072, + .init = ff_tx_fft_sr_codelet_init_b8_i2_x86, + .cpu_flags = AV_CPU_FLAG_AVX, + .prio = 256, +}; - if (EXTERNAL_AVX_FAST(cpu_flags)) { - if (s->m == 8) - TXFN(ff_fft8_float_avx, 1, 8, 0); - else if (s->m == 16) - TXFN(ff_fft16_float_avx, 1, 8, 2); -#if ARCH_X86_64 - else if (s->m == 32) - TXFN(ff_fft32_float_avx, 1, 8, 2); - else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE)) - TXFN(ff_split_radix_fft_float_avx, 1, 8, 2); +#if HAVE_AVX2_EXTERNAL +void ff_tx_fft_sr_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride); +const FFTXCodelet ff_tx_fft_sr_float_avx2_def = { + .name = "fft_sr_float_avx2", + .function = ff_tx_fft_sr_float_avx2, + .type = TX_TYPE(FFT), + .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE, + .factors[0] = 2, + .min_len = 64, + .max_len = 131072, + .init = ff_tx_fft_sr_codelet_init_b8_i2_x86, + .cpu_flags = AV_CPU_FLAG_AVX2, + .prio = 288, +}; #endif - } - - if (EXTERNAL_FMA3_FAST(cpu_flags)) { - if (s->m == 16) - TXFN(ff_fft16_float_fma3, 1, 8, 2); -#if ARCH_X86_64 - else if (s->m == 32) - TXFN(ff_fft32_float_fma3, 1, 8, 2); #endif - } + +const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { + /* Split-Radix codelets */ + &ff_tx_fft2_float_sse3_def, + &ff_tx_fft4_fwd_float_sse2_def, + &ff_tx_fft4_inv_float_sse2_def, + &ff_tx_fft8_float_sse3_def, + &ff_tx_fft8_float_avx_def, + &ff_tx_fft16_float_avx_def, + &ff_tx_fft16_float_fma3_def, #if ARCH_X86_64 - if (EXTERNAL_AVX2_FAST(cpu_flags)) { - if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE)) - TXFN(ff_split_radix_fft_float_avx2, 1, 8, 2); - } -#endif - } + 
&ff_tx_fft32_float_avx_def, + &ff_tx_fft32_float_fma3_def, - if (gen_revtab) - ff_tx_gen_split_radix_parity_revtab(s->revtab, s->m, s->inv, basis, - revtab_interleave); + /* Standalone transforms */ + &ff_tx_fft_sr_float_avx_def, +#if HAVE_AVX2_EXTERNAL + &ff_tx_fft_sr_float_avx2_def, +#endif +#endif -#undef TXFN -} + NULL, +}; |