From 7f019e77586ab62d37856d09071fcdeef880bcd9 Mon Sep 17 00:00:00 2001 From: Lynne Date: Sat, 19 Nov 2022 00:58:13 +0100 Subject: lavu/tx: add length decomposition function Rather than using a list of lengths supported, this goes a step beyond and uses all registered codelets to come up with a good decomposition. --- libavutil/tx.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++ libavutil/tx_priv.h | 8 +++ 2 files changed, 151 insertions(+) (limited to 'libavutil') diff --git a/libavutil/tx.c b/libavutil/tx.c index 8027e983ba..c692239656 100644 --- a/libavutil/tx.c +++ b/libavutil/tx.c @@ -17,6 +17,7 @@ */ #include "avassert.h" +#include "intmath.h" #include "cpu.h" #include "qsort.h" #include "bprint.h" @@ -395,6 +396,148 @@ static int get_codelet_prio(const FFTXCodelet *cd, int cpu_flags, int len) return prio; } +typedef struct FFTXLenDecomp { + int len; + int len2; + int prio; + const FFTXCodelet *cd; +} FFTXLenDecomp; + +static int cmp_decomp(FFTXLenDecomp *a, FFTXLenDecomp *b) +{ + return FFDIFFSIGN(b->prio, a->prio); +} + +int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type, + int len, int inv) +{ + int nb_decomp = 0; + FFTXLenDecomp ld[TX_MAX_DECOMPOSITIONS]; + int codelet_list_idx = codelet_list_num; + + const int cpu_flags = av_get_cpu_flags(); + + /* Loop through all codelets in all codelet lists to find matches + * to the requirements */ + while (codelet_list_idx--) { + const FFTXCodelet * const * list = codelet_list[codelet_list_idx]; + const FFTXCodelet *cd = NULL; + + while ((cd = *list++)) { + int fl = len; + int skip = 0, prio; + int factors_product = 1, factors_mod = 0; + + if (nb_decomp >= TX_MAX_DECOMPOSITIONS) + goto sort; + + /* Check if the type matches */ + if (cd->type != TX_TYPE_ANY && type != cd->type) + continue; + + /* Check direction for non-orthogonal codelets */ + if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) || + ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv)) + continue; + + /* Check if the CPU supports the required ISA */ + if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL && + !(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask))) + continue; + + for (int i = 0; i < TX_MAX_FACTORS; i++) { + if (!cd->factors[i] || (fl == 1)) + break; + + if (cd->factors[i] == TX_FACTOR_ANY) { + factors_mod++; + factors_product *= fl; + } else if (!(fl % cd->factors[i])) { + factors_mod++; + if (cd->factors[i] == 2) { + int b = ff_ctz(fl); + fl >>= b; + factors_product <<= b; + } else { + do { + fl /= cd->factors[i]; + factors_product *= cd->factors[i]; + } while (!(fl % cd->factors[i])); + } + } + } + + /* Disqualify if factor requirements are not satisfied or if trivial */ + if ((factors_mod < cd->nb_factors) || (len == factors_product)) + continue; + + if (av_gcd(factors_product, fl) != 1) + continue; + + /* Check if length is supported and factorization was successful */ + if ((factors_product < cd->min_len) || + (cd->max_len != TX_LEN_UNLIMITED && (factors_product > cd->max_len))) + continue; + + prio = get_codelet_prio(cd, cpu_flags, factors_product) * factors_product; + + /* Check for duplicates */ + for (int i = 0; i < nb_decomp; i++) { + if (factors_product == ld[i].len) { + /* Update priority if new one is higher */ + if (prio > ld[i].prio) + ld[i].prio = prio; + skip = 1; + break; + } + } + + /* Add decomposition if unique */ + if (!skip) { + ld[nb_decomp].cd = cd; + ld[nb_decomp].len = factors_product; + ld[nb_decomp].len2 = fl; + ld[nb_decomp].prio = prio; + nb_decomp++; + } + } + } + + if (!nb_decomp) + return AVERROR(EINVAL); + +sort: + AV_QSORT(ld, nb_decomp, FFTXLenDecomp, cmp_decomp); + + for (int i = 0; i < nb_decomp; i++) { + if (ld[i].cd->nb_factors > 1) + dst[i] = ld[i].len2; + else + dst[i] = ld[i].len; + } + + return nb_decomp; +} + +int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts) +{ + s->map = av_malloc(s->len*sizeof(*s->map)); + if (!s->map) + return AVERROR(ENOMEM); + + s->map[0] = 0; /* DC is always at the start */ + if (s->inv) /* Reversing the ACs flips the transform direction */ + for (int i = 1; i < s->len; i++) + s->map[i] = s->len - i; + else + for (int i = 1; i < s->len; i++) + s->map[i] = i; + + s->map_dir = FF_TX_MAP_GATHER; + + return 0; +} + #if !CONFIG_SMALL static void print_flags(AVBPrint *bp, uint64_t f) { diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h index 207f79dfb8..f11061d051 100644 --- a/libavutil/tx_priv.h +++ b/libavutil/tx_priv.h @@ -183,6 +183,9 @@ typedef struct FFTXCodeletOptions { /* Maximum amount of subtransform functions, subtransforms and factors. Arbitrary. */ #define TX_MAX_SUB 4 +/* Maximum number of returned results for ff_tx_decompose_length. Arbitrary. */ +#define TX_MAX_DECOMPOSITIONS 512 + typedef struct FFTXCodelet { const char *name; /* Codelet name, for debugging */ av_tx_fn function; /* Codelet function, != NULL */ @@ -284,6 +287,11 @@ int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, /* Clear the context by freeing all tables, maps and subtransforms. */ void ff_tx_clear_ctx(AVTXContext *s); +/* Attempt to factorize a length into 2 integers such that + * len / dst1 == dst2, where dst1 and dst2 are coprime. */ +int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type, + int len, int inv); + /* Generate a default map (0->len or 0, (len-1)->1 for inverse transforms) * for a context. */ int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts); -- cgit v1.2.3