summaryrefslogtreecommitdiff
path: root/libavutil/x86
diff options
context:
space:
mode:
author	Lynne <dev@lynne.ee>	2022-01-20 07:14:46 +0100
committer	Lynne <dev@lynne.ee>	2022-01-26 04:12:44 +0100
commit	ef4bd8161575a79f0ac247ad0aa2f05b8c20052b (patch)
tree	cf8488b2f2e9b0b88dd04b511113289d79852486 /libavutil/x86
parent	c14976be045f3fe658c12d7e30946cdb380452ec (diff)
lavu/tx: rewrite internal code as a tree-based codelet constructor
This commit rewrites the internal transform code into a constructor that stitches transforms (codelets). This allows for transforms to reuse arbitrary parts of other transforms, and allows transforms to be stacked onto one another (such as a full iMDCT using a half-iMDCT which in turn uses an FFT). It also permits for each step to be individually replaced by assembly or a custom implementation (such as an ASIC).
Diffstat (limited to 'libavutil/x86')
-rw-r--r--	libavutil/x86/tx_float.asm	| 85
-rw-r--r--	libavutil/x86/tx_float_init.c	| 160
2 files changed, 131 insertions(+), 114 deletions(-)
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 4d2283fae1..e3b48d7c1f 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -31,6 +31,8 @@
%include "x86util.asm"
+%define private_prefix ff_tx
+
%if ARCH_X86_64
%define ptr resq
%else
@@ -39,25 +41,22 @@
%assign i 16
%rep 14
-cextern cos_ %+ i %+ _float ; ff_cos_i_float...
+cextern tab_ %+ i %+ _float ; ff_tab_i_float...
%assign i (i << 1)
%endrep
struc AVTXContext
- .n: resd 1 ; Non-power-of-two part
- .m: resd 1 ; Power-of-two part
- .inv: resd 1 ; Is inverse
- .type: resd 1 ; Type
- .flags: resq 1 ; Flags
- .scale: resq 1 ; Scale
-
- .exptab: ptr 1 ; MDCT exptab
- .tmp: ptr 1 ; Temporary buffer needed for all compound transforms
- .pfatab: ptr 1 ; Input/Output mapping for compound transforms
- .revtab: ptr 1 ; Input mapping for power of two transforms
- .inplace_idx: ptr 1 ; Required indices to revtab for in-place transforms
-
- .top_tx ptr 1 ; Used for transforms derived from other transforms
+ .len: resd 1 ; Length
+ .inv resd 1 ; Inverse flag
+ .map: ptr 1 ; Lookup table(s)
+ .exp: ptr 1 ; Exponentiation factors
+ .tmp: ptr 1 ; Temporary data
+
+ .sub: ptr 1 ; Subcontexts
+ .fn: ptr 4 ; Subcontext functions
+ .nb_sub: resd 1 ; Subcontext count
+
+ ; Everything else is inaccessible
endstruc
SECTION_RODATA 32
@@ -485,8 +484,8 @@ SECTION .text
movaps [outq + 10*mmsize], tx1_o0
movaps [outq + 14*mmsize], tx2_o0
- movaps tw_e, [cos_64_float + mmsize]
- vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
+ movaps tw_e, [tab_64_float + mmsize]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
movaps m0, [outq + 1*mmsize]
movaps m1, [outq + 3*mmsize]
@@ -710,8 +709,7 @@ FFT4 inv, 1
INIT_XMM sse3
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
- mov ctxq, [ctxq + AVTXContext.revtab]
-
+ mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
@@ -733,8 +731,7 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
INIT_YMM avx
cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
- mov ctxq, [ctxq + AVTXContext.revtab]
-
+ mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
@@ -754,7 +751,7 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
%macro FFT16_FN 1
INIT_YMM %1
cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
- mov ctxq, [ctxq + AVTXContext.revtab]
+ mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
@@ -786,7 +783,7 @@ FFT16_FN fma3
%macro FFT32_FN 1
INIT_YMM %1
cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
- mov ctxq, [ctxq + AVTXContext.revtab]
+ mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
@@ -800,8 +797,8 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
- movaps m8, [cos_32_float]
- vperm2f128 m9, m9, [cos_32_float + 4*8 - 4*7], 0x23
+ movaps m8, [tab_32_float]
+ vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23
FFT16 m0, m1, m2, m3, m10, m11, m12, m13
@@ -858,8 +855,8 @@ ALIGN 16
POP lenq
sub outq, (%1*4) + (%1*2) + (%1/2)
- lea rtabq, [cos_ %+ %1 %+ _float]
- lea itabq, [cos_ %+ %1 %+ _float + %1 - 4*7]
+ lea rtabq, [tab_ %+ %1 %+ _float]
+ lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]
%if %0 > 1
cmp tgtq, %1
@@ -883,9 +880,9 @@ ALIGN 16
%macro FFT_SPLIT_RADIX_FN 1
INIT_YMM %1
-cglobal split_radix_fft_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
- movsxd lenq, dword [lutq + AVTXContext.m]
- mov lutq, [lutq + AVTXContext.revtab]
+cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
+ movsxd lenq, dword [lutq + AVTXContext.len]
+ mov lutq, [lutq + AVTXContext.map]
mov tgtq, lenq
; Bottom-most/32-point transform ===============================================
@@ -903,8 +900,8 @@ ALIGN 16
LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
- movaps m8, [cos_32_float]
- vperm2f128 m9, m9, [cos_32_float + 32 - 4*7], 0x23
+ movaps m8, [tab_32_float]
+ vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23
FFT16 m0, m1, m2, m3, m10, m11, m12, m13
@@ -961,8 +958,8 @@ ALIGN 16
FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
- movaps tw_e, [cos_64_float]
- vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7], 0x23
+ movaps tw_e, [tab_64_float]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
add lutq, (mmsize/2)*8
cmp tgtq, 64
@@ -989,8 +986,8 @@ ALIGN 16
POP lenq
sub outq, 24*mmsize
- lea rtabq, [cos_128_float]
- lea itabq, [cos_128_float + 128 - 4*7]
+ lea rtabq, [tab_128_float]
+ lea itabq, [tab_128_float + 128 - 4*7]
cmp tgtq, 128
je .deinterleave
@@ -1016,8 +1013,8 @@ ALIGN 16
POP lenq
sub outq, 48*mmsize
- lea rtabq, [cos_256_float]
- lea itabq, [cos_256_float + 256 - 4*7]
+ lea rtabq, [tab_256_float]
+ lea itabq, [tab_256_float + 256 - 4*7]
cmp tgtq, 256
je .deinterleave
@@ -1044,8 +1041,8 @@ ALIGN 16
POP lenq
sub outq, 96*mmsize
- lea rtabq, [cos_512_float]
- lea itabq, [cos_512_float + 512 - 4*7]
+ lea rtabq, [tab_512_float]
+ lea itabq, [tab_512_float + 512 - 4*7]
cmp tgtq, 512
je .deinterleave
@@ -1079,8 +1076,8 @@ ALIGN 16
POP lenq
sub outq, 192*mmsize
- lea rtabq, [cos_1024_float]
- lea itabq, [cos_1024_float + 1024 - 4*7]
+ lea rtabq, [tab_1024_float]
+ lea itabq, [tab_1024_float + 1024 - 4*7]
cmp tgtq, 1024
je .deinterleave
@@ -1160,8 +1157,8 @@ FFT_SPLIT_RADIX_DEF 131072
vextractf128 [outq + 13*mmsize + 0], tw_e, 1
vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1
- movaps tw_e, [cos_64_float + mmsize]
- vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
+ movaps tw_e, [tab_64_float + mmsize]
+ vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
movaps m0, [outq + 1*mmsize]
movaps m1, [outq + 3*mmsize]
diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c
index 8b77a5f29f..5c9afa75cc 100644
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -21,86 +21,106 @@
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
-void ff_fft2_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_fft4_inv_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_fft4_fwd_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_fft8_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_fft8_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_fft16_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_fft16_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_fft32_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_fft32_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
+#include "config.h"
-void ff_split_radix_fft_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_split_radix_fft_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-
-av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx)
-{
- int cpu_flags = av_get_cpu_flags();
- int gen_revtab = 0, basis, revtab_interleave;
+#define DECL_INIT_FN(basis, interleave) \
+static av_cold int \
+ ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86 \
+ (AVTXContext *s, \
+ const FFTXCodelet *cd, \
+ uint64_t flags, \
+ FFTXCodeletOptions *opts, \
+ int len, int inv, \
+ const void *scale) \
+{ \
+ const int inv_lookup = opts ? opts->invert_lookup : 1; \
+ ff_tx_init_tabs_float(len); \
+ return ff_tx_gen_split_radix_parity_revtab(s, inv_lookup, \
+ basis, interleave); \
+}
- if (s->flags & AV_TX_UNALIGNED)
- return;
+#define ff_tx_fft_sr_codelet_init_b0_i0_x86 NULL
+DECL_INIT_FN(8, 0)
+DECL_INIT_FN(8, 2)
- if (ff_tx_type_is_mdct(s->type))
- return;
+#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \
+void ff_tx_ ##fn_name(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \
+static const FFTXCodelet ff_tx_ ##fn_name## _def = { \
+ .name = #fn_name, \
+ .function = ff_tx_ ##fn_name, \
+ .type = TX_TYPE(FFT), \
+ .flags = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED | fn_flags, \
+ .factors[0] = 2, \
+ .min_len = len, \
+ .max_len = len, \
+ .init = ff_tx_fft_sr_codelet_init_ ##init_fn## _x86, \
+ .cpu_flags = AV_CPU_FLAG_ ##cpu, \
+ .prio = fn_prio, \
+};
-#define TXFN(fn, gentab, sr_basis, interleave) \
- do { \
- *tx = fn; \
- gen_revtab = gentab; \
- basis = sr_basis; \
- revtab_interleave = interleave; \
- } while (0)
+DECL_SR_CD_DEF(fft2_float_sse3, 2, b0_i0, 128, SSE3, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft4_fwd_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY)
+DECL_SR_CD_DEF(fft4_inv_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY)
+DECL_SR_CD_DEF(fft8_float_sse3, 8, b8_i0, 128, SSE3, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft8_float_avx, 8, b8_i0, 256, AVX, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft16_float_avx, 16, b8_i2, 256, AVX, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft16_float_fma3, 16, b8_i2, 288, FMA3, AV_TX_INPLACE)
- if (s->n == 1) {
- if (EXTERNAL_SSE2(cpu_flags)) {
- if (s->m == 4 && s->inv)
- TXFN(ff_fft4_inv_float_sse2, 0, 0, 0);
- else if (s->m == 4)
- TXFN(ff_fft4_fwd_float_sse2, 0, 0, 0);
- }
+#if ARCH_X86_64
+DECL_SR_CD_DEF(fft32_float_avx, 32, b8_i2, 256, AVX, AV_TX_INPLACE)
+DECL_SR_CD_DEF(fft32_float_fma3, 32, b8_i2, 288, FMA3, AV_TX_INPLACE)
- if (EXTERNAL_SSE3(cpu_flags)) {
- if (s->m == 2)
- TXFN(ff_fft2_float_sse3, 0, 0, 0);
- else if (s->m == 8)
- TXFN(ff_fft8_float_sse3, 1, 8, 0);
- }
+void ff_tx_fft_sr_float_avx(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
+const FFTXCodelet ff_tx_fft_sr_float_avx_def = {
+ .name = "fft_sr_float_avx",
+ .function = ff_tx_fft_sr_float_avx,
+ .type = TX_TYPE(FFT),
+ .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE,
+ .factors[0] = 2,
+ .min_len = 64,
+ .max_len = 131072,
+ .init = ff_tx_fft_sr_codelet_init_b8_i2_x86,
+ .cpu_flags = AV_CPU_FLAG_AVX,
+ .prio = 256,
+};
- if (EXTERNAL_AVX_FAST(cpu_flags)) {
- if (s->m == 8)
- TXFN(ff_fft8_float_avx, 1, 8, 0);
- else if (s->m == 16)
- TXFN(ff_fft16_float_avx, 1, 8, 2);
-#if ARCH_X86_64
- else if (s->m == 32)
- TXFN(ff_fft32_float_avx, 1, 8, 2);
- else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE))
- TXFN(ff_split_radix_fft_float_avx, 1, 8, 2);
+#if HAVE_AVX2_EXTERNAL
+void ff_tx_fft_sr_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
+const FFTXCodelet ff_tx_fft_sr_float_avx2_def = {
+ .name = "fft_sr_float_avx2",
+ .function = ff_tx_fft_sr_float_avx2,
+ .type = TX_TYPE(FFT),
+ .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE,
+ .factors[0] = 2,
+ .min_len = 64,
+ .max_len = 131072,
+ .init = ff_tx_fft_sr_codelet_init_b8_i2_x86,
+ .cpu_flags = AV_CPU_FLAG_AVX2,
+ .prio = 288,
+};
#endif
- }
-
- if (EXTERNAL_FMA3_FAST(cpu_flags)) {
- if (s->m == 16)
- TXFN(ff_fft16_float_fma3, 1, 8, 2);
-#if ARCH_X86_64
- else if (s->m == 32)
- TXFN(ff_fft32_float_fma3, 1, 8, 2);
#endif
- }
+
+const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
+ /* Split-Radix codelets */
+ &ff_tx_fft2_float_sse3_def,
+ &ff_tx_fft4_fwd_float_sse2_def,
+ &ff_tx_fft4_inv_float_sse2_def,
+ &ff_tx_fft8_float_sse3_def,
+ &ff_tx_fft8_float_avx_def,
+ &ff_tx_fft16_float_avx_def,
+ &ff_tx_fft16_float_fma3_def,
#if ARCH_X86_64
- if (EXTERNAL_AVX2_FAST(cpu_flags)) {
- if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE))
- TXFN(ff_split_radix_fft_float_avx2, 1, 8, 2);
- }
-#endif
- }
+ &ff_tx_fft32_float_avx_def,
+ &ff_tx_fft32_float_fma3_def,
- if (gen_revtab)
- ff_tx_gen_split_radix_parity_revtab(s->revtab, s->m, s->inv, basis,
- revtab_interleave);
+ /* Standalone transforms */
+ &ff_tx_fft_sr_float_avx_def,
+#if HAVE_AVX2_EXTERNAL
+ &ff_tx_fft_sr_float_avx2_def,
+#endif
+#endif
-#undef TXFN
-}
+ NULL,
+};