summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLynne <dev@lynne.ee>2022-11-17 22:14:53 +0100
committerLynne <dev@lynne.ee>2022-11-24 15:58:33 +0100
commit1c8d77a2bfa239621b63c4553c6221560b1ee298 (patch)
tree696e0803c1dc3e415e88c5be8147f0533d571478
parent958b3760b54245b934054f8aa72a608bdb2a48b8 (diff)
lavu/tx: refactor and separate codelet list and prio code
-rw-r--r--libavutil/tx.c125
1 files changed, 66 insertions, 59 deletions
diff --git a/libavutil/tx.c b/libavutil/tx.c
index 319392788f..ff81d235ba 100644
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -300,6 +300,67 @@ static const FFTXCodelet * const ff_tx_null_list[] = {
NULL,
};
+/* Array of all compiled codelet lists. Order is irrelevant. */
+static const FFTXCodelet * const * const codelet_list[] = {
+ ff_tx_codelet_list_float_c,
+ ff_tx_codelet_list_double_c,
+ ff_tx_codelet_list_int32_c,
+ ff_tx_null_list,
+#if HAVE_X86ASM
+ ff_tx_codelet_list_float_x86,
+#endif
+#if ARCH_AARCH64
+ ff_tx_codelet_list_float_aarch64,
+#endif
+};
+static const int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
+
+static const int cpu_slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW |
+ AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW |
+ AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER;
+
+static const int cpu_slow_penalties[][2] = {
+ { AV_CPU_FLAG_SSE2SLOW, 1 + 64 },
+ { AV_CPU_FLAG_SSE3SLOW, 1 + 64 },
+ { AV_CPU_FLAG_SSSE3SLOW, 1 + 64 },
+ { AV_CPU_FLAG_ATOM, 1 + 128 },
+ { AV_CPU_FLAG_AVXSLOW, 1 + 128 },
+ { AV_CPU_FLAG_SLOW_GATHER, 1 + 32 },
+};
+
+static int get_codelet_prio(const FFTXCodelet *cd, int cpu_flags, int len)
+{
+ int prio = cd->prio;
+ int max_factor = 0;
+
+ /* If the CPU has a SLOW flag, and the instruction is also flagged
+ * as being slow for such, reduce its priority */
+ for (int i = 0; i < FF_ARRAY_ELEMS(cpu_slow_penalties); i++) {
+ if ((cpu_flags & cd->cpu_flags) & cpu_slow_penalties[i][0])
+ prio -= cpu_slow_penalties[i][1];
+ }
+
+ /* Prioritize aligned-only codelets */
+ if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
+ prio += 64;
+
+ /* Codelets for specific lengths are generally faster */
+ if ((len == cd->min_len) && (len == cd->max_len))
+ prio += 64;
+
+ /* Forward-only or inverse-only transforms are generally better */
+ if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
+ prio += 64;
+
+ /* Larger factors are generally better */
+ for (int i = 0; i < TX_MAX_SUB; i++)
+ max_factor = FFMAX(cd->factors[i], max_factor);
+ if (max_factor)
+ prio += 16*max_factor;
+
+ return prio;
+}
+
#if !CONFIG_SMALL
static void print_flags(AVBPrint *bp, uint64_t f)
{
@@ -465,41 +526,15 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
AVTXContext *sub = NULL;
TXCodeletMatch *cd_tmp, *cd_matches = NULL;
unsigned int cd_matches_size = 0;
+ int codelet_list_idx = codelet_list_num;
int nb_cd_matches = 0;
#if !CONFIG_SMALL
AVBPrint bp = { 0 };
#endif
- /* Array of all compiled codelet lists. Order is irrelevant. */
- const FFTXCodelet * const * const codelet_list[] = {
- ff_tx_codelet_list_float_c,
- ff_tx_codelet_list_double_c,
- ff_tx_codelet_list_int32_c,
- ff_tx_null_list,
-#if HAVE_X86ASM
- ff_tx_codelet_list_float_x86,
-#endif
-#if ARCH_AARCH64
- ff_tx_codelet_list_float_aarch64,
-#endif
- };
- int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
-
/* We still accept functions marked with SLOW, even if the CPU is
* marked with the same flag, but we give them lower priority. */
const int cpu_flags = av_get_cpu_flags();
- const int slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW |
- AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW |
- AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER;
-
- static const int slow_penalties[][2] = {
- { AV_CPU_FLAG_SSE2SLOW, 1 + 64 },
- { AV_CPU_FLAG_SSE3SLOW, 1 + 64 },
- { AV_CPU_FLAG_SSSE3SLOW, 1 + 64 },
- { AV_CPU_FLAG_ATOM, 1 + 128 },
- { AV_CPU_FLAG_AVXSLOW, 1 + 128 },
- { AV_CPU_FLAG_SLOW_GATHER, 1 + 32 },
- };
/* Flags the transform wants */
uint64_t req_flags = flags;
@@ -519,13 +554,11 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
/* Loop through all codelets in all codelet lists to find matches
* to the requirements */
- while (codelet_list_num--) {
- const FFTXCodelet * const * list = codelet_list[codelet_list_num];
+ while (codelet_list_idx--) {
+ const FFTXCodelet * const * list = codelet_list[codelet_list_idx];
const FFTXCodelet *cd = NULL;
while ((cd = *list++)) {
- int max_factor = 0;
-
/* Check if the type matches */
if (cd->type != TX_TYPE_ANY && type != cd->type)
continue;
@@ -546,7 +579,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
/* Check if the CPU supports the required ISA */
if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
- !(cpu_flags & (cd->cpu_flags & ~slow_mask)))
+ !(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask)))
continue;
/* Check for factors */
@@ -563,33 +596,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
cd_matches = cd_tmp;
cd_matches[nb_cd_matches].cd = cd;
- cd_matches[nb_cd_matches].prio = cd->prio;
-
- /* If the CPU has a SLOW flag, and the instruction is also flagged
- * as being slow for such, reduce its priority */
- for (int i = 0; i < FF_ARRAY_ELEMS(slow_penalties); i++) {
- if ((cpu_flags & cd->cpu_flags) & slow_penalties[i][0])
- cd_matches[nb_cd_matches].prio -= slow_penalties[i][1];
- }
-
- /* Prioritize aligned-only codelets */
- if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
- cd_matches[nb_cd_matches].prio += 64;
-
- /* Codelets for specific lengths are generally faster */
- if ((len == cd->min_len) && (len == cd->max_len))
- cd_matches[nb_cd_matches].prio += 64;
-
- /* Forward-only or inverse-only transforms are generally better */
- if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
- cd_matches[nb_cd_matches].prio += 64;
-
- /* Larger factors are generally better */
- for (int i = 0; i < TX_MAX_SUB; i++)
- max_factor = FFMAX(cd->factors[i], max_factor);
- if (max_factor)
- cd_matches[nb_cd_matches].prio += 16*max_factor;
-
+ cd_matches[nb_cd_matches].prio = get_codelet_prio(cd, cpu_flags, len);
nb_cd_matches++;
}
}