summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLynne <dev@lynne.ee>2022-08-16 01:11:40 +0200
committerLynne <dev@lynne.ee>2022-08-16 01:22:38 +0200
commitae66a9db7bc19f00daaad96b3c15cbffe6280a93 (patch)
tree1c044b8f6d388b624308d8f1be559dfc9ded6708
parent412922cc6fa790897ef6bb2be5d6f9a5f030754d (diff)
lavu/tx: optimize and simplify inverse MDCTs
Convert the input from a scatter to a gather instead, which is faster and better for SIMD. Also, add a pre-shuffled exptab version to avoid gathering there at all. This doubles the exptab size, but the speedup makes it worth it. In SIMD, the exptab will likely be purged to a higher cache anyway because of the FFT in the middle, and the amount of loads stays identical. For a 960-point inverse MDCT, the speedup is 10%. This makes it possible to write sane and fast SIMD versions of inverse MDCTs.
-rw-r--r--libavutil/tx.c4
-rw-r--r--libavutil/tx_priv.h11
-rw-r--r--libavutil/tx_template.c50
3 files changed, 45 insertions, 20 deletions
diff --git a/libavutil/tx.c b/libavutil/tx.c
index 4cc3a98751..e6fcf9f451 100644
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -44,7 +44,6 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
int *in_map, *out_map;
const int inv = s->inv;
const int len = n*m; /* Will not be equal to s->len for MDCTs */
- const int mdct = TYPE_IS(MDCT, s->type);
int m_inv, n_inv;
/* Make sure the numbers are coprime */
@@ -63,8 +62,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
/* Ruritanian map for input, CRT map for output, can be swapped */
for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
- /* Shifted by 1 to simplify MDCTs */
- in_map[j*n + i] = ((i*m + j*n) % len) << mdct;
+ in_map[j*n + i] = (i*m + j*n) % len;
out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
}
}
diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h
index c9eda44e61..e6b0326cd9 100644
--- a/libavutil/tx_priv.h
+++ b/libavutil/tx_priv.h
@@ -297,10 +297,13 @@ void ff_tx_init_tabs_float (int len);
void ff_tx_init_tabs_double(int len);
void ff_tx_init_tabs_int32 (int len);
-/* Typed init function to initialize an MDCT exptab in a context. */
-int ff_tx_mdct_gen_exp_float (AVTXContext *s);
-int ff_tx_mdct_gen_exp_double(AVTXContext *s);
-int ff_tx_mdct_gen_exp_int32 (AVTXContext *s);
+/* Typed init function to initialize an MDCT exptab in a context.
+ * If pre_tab is set, duplicates the entire table, with the first
+ * copy being shuffled according to pre_tab, and the second copy
+ * being the original. */
+int ff_tx_mdct_gen_exp_float (AVTXContext *s, int *pre_tab);
+int ff_tx_mdct_gen_exp_double(AVTXContext *s, int *pre_tab);
+int ff_tx_mdct_gen_exp_int32 (AVTXContext *s, int *pre_tab);
/* Lists of codelets */
extern const FFTXCodelet * const ff_tx_codelet_list_float_c [];
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 1e4354580b..35b61fa477 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -948,7 +948,7 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s,
const void *scale)
{
int ret;
- FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
+ FFTXCodeletOptions sub_opts = { .invert_lookup = inv };
s->scale_d = *((SCALE_TYPE *)scale);
s->scale_f = s->scale_d;
@@ -961,9 +961,14 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s,
inv, scale)))
return ret;
- if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s)))
+ if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->sub->map : NULL)))
return ret;
+ /* Saves a multiply in a hot path. */
+ if (inv)
+ for (int i = 0; i < (s->len >> 1); i++)
+ s->sub->map[i] <<= 1;
+
return 0;
}
@@ -1020,12 +1025,14 @@ static void TX_NAME(ff_tx_mdct_sr_inv)(AVTXContext *s, void *_dst, void *_src,
in2 = src + ((len2*2) - 1) * stride;
for (int i = 0; i < len2; i++) {
- TXComplex tmp = { in2[-2*i*stride], in1[2*i*stride] };
- CMUL3(z[sub_map[i]], tmp, exp[i]);
+ int k = sub_map[i];
+ TXComplex tmp = { in2[-k*stride], in1[k*stride] };
+ CMUL3(z[i], tmp, exp[i]);
}
s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
+ exp += len2;
for (int i = 0; i < len4; i++) {
const int i0 = len4 + i, i1 = len4 - i - 1;
TXComplex src1 = { z[i1].im, z[i1].re };
@@ -1141,9 +1148,13 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
return ret;
- if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s)))
+ if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
return ret;
+ /* Saves multiplies in loops. */
+ for (int i = 0; i < len; i++)
+ s->map[i] <<= 1;
+
if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
return AVERROR(ENOMEM);
@@ -1160,6 +1171,7 @@ static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
TXComplex *z = _dst, *exp = s->exp; \
const TXSample *src = _src, *in1, *in2; \
const int len4 = s->len >> 2; \
+ const int len2 = s->len >> 1; \
const int m = s->sub->len; \
const int *in_map = s->map, *out_map = in_map + N*m; \
const int *sub_map = s->sub->map; \
@@ -1168,13 +1180,15 @@ static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
in1 = src; \
in2 = src + ((N*m*2) - 1) * stride; \
\
- for (int i = 0; i < m; i++) { \
+ for (int i = 0; i < len2; i += N) { \
for (int j = 0; j < N; j++) { \
- const int k = in_map[i*N + j]; \
+ const int k = in_map[j]; \
TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \
- CMUL3(fft##N##in[j], tmp, exp[k >> 1]); \
+ CMUL3(fft##N##in[j], tmp, exp[j]); \
} \
- fft##N(s->tmp + sub_map[i], fft##N##in, m); \
+ fft##N(s->tmp + *(sub_map++), fft##N##in, m); \
+ exp += N; \
+ in_map += N; \
} \
\
for (int i = 0; i < N; i++) \
@@ -1405,22 +1419,32 @@ static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = {
.prio = FF_TX_PRIO_BASE,
};
-int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s)
+int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
{
+ int off = 0;
int len4 = s->len >> 1;
double scale = s->scale_d;
const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
+ size_t alloc = pre_tab ? 2*len4 : len4;
- if (!(s->exp = av_malloc_array(len4, sizeof(*s->exp))))
+ if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
return AVERROR(ENOMEM);
scale = sqrt(fabs(scale));
+
+ if (pre_tab)
+ off = len4;
+
for (int i = 0; i < len4; i++) {
const double alpha = M_PI_2 * (i + theta) / len4;
- s->exp[i].re = RESCALE(cos(alpha) * scale);
- s->exp[i].im = RESCALE(sin(alpha) * scale);
+ s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
+ RESCALE(sin(alpha) * scale) };
}
+ if (pre_tab)
+ for (int i = 0; i < len4; i++)
+ s->exp[i] = s->exp[len4 + pre_tab[i]];
+
return 0;
}