[FFmpeg-cvslog] lavu/tx: refactor and separate codelet list and prio code

Lynne git at videolan.org
Thu Nov 24 16:59:44 EET 2022


ffmpeg | branch: master | Lynne <dev at lynne.ee> | Thu Nov 17 22:14:53 2022 +0100| [1c8d77a2bfa239621b63c4553c6221560b1ee298] | committer: Lynne

lavu/tx: refactor and separate codelet list and prio code

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1c8d77a2bfa239621b63c4553c6221560b1ee298
---

 libavutil/tx.c | 125 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 66 insertions(+), 59 deletions(-)

diff --git a/libavutil/tx.c b/libavutil/tx.c
index 319392788f..ff81d235ba 100644
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -300,6 +300,67 @@ static const FFTXCodelet * const ff_tx_null_list[] = {
     NULL,
 };
 
+/* Array of all compiled codelet lists. Order is irrelevant. */
+static const FFTXCodelet * const * const codelet_list[] = {
+    ff_tx_codelet_list_float_c,
+    ff_tx_codelet_list_double_c,
+    ff_tx_codelet_list_int32_c,
+    ff_tx_null_list,
+#if HAVE_X86ASM
+    ff_tx_codelet_list_float_x86,
+#endif
+#if ARCH_AARCH64
+    ff_tx_codelet_list_float_aarch64,
+#endif
+};
+static const int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
+
+static const int cpu_slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW  |
+                                 AV_CPU_FLAG_ATOM     | AV_CPU_FLAG_SSSE3SLOW |
+                                 AV_CPU_FLAG_AVXSLOW  | AV_CPU_FLAG_SLOW_GATHER;
+
+static const int cpu_slow_penalties[][2] = {
+    { AV_CPU_FLAG_SSE2SLOW,    1 + 64  },
+    { AV_CPU_FLAG_SSE3SLOW,    1 + 64  },
+    { AV_CPU_FLAG_SSSE3SLOW,   1 + 64  },
+    { AV_CPU_FLAG_ATOM,        1 + 128 },
+    { AV_CPU_FLAG_AVXSLOW,     1 + 128 },
+    { AV_CPU_FLAG_SLOW_GATHER, 1 + 32  },
+};
+
+static int get_codelet_prio(const FFTXCodelet *cd, int cpu_flags, int len)
+{
+    int prio = cd->prio;
+    int max_factor = 0;
+
+    /* If the CPU has a SLOW flag, and the instruction is also flagged
+     * as being slow for such, reduce its priority */
+    for (int i = 0; i < FF_ARRAY_ELEMS(cpu_slow_penalties); i++) {
+        if ((cpu_flags & cd->cpu_flags) & cpu_slow_penalties[i][0])
+            prio -= cpu_slow_penalties[i][1];
+    }
+
+    /* Prioritize aligned-only codelets */
+    if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
+        prio += 64;
+
+    /* Codelets for specific lengths are generally faster */
+    if ((len == cd->min_len) && (len == cd->max_len))
+        prio += 64;
+
+    /* Forward-only or inverse-only transforms are generally better */
+    if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
+        prio += 64;
+
+    /* Larger factors are generally better */
+    for (int i = 0; i < TX_MAX_SUB; i++)
+        max_factor = FFMAX(cd->factors[i], max_factor);
+    if (max_factor)
+        prio += 16*max_factor;
+
+    return prio;
+}
+
 #if !CONFIG_SMALL
 static void print_flags(AVBPrint *bp, uint64_t f)
 {
@@ -465,41 +526,15 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
     AVTXContext *sub = NULL;
     TXCodeletMatch *cd_tmp, *cd_matches = NULL;
     unsigned int cd_matches_size = 0;
+    int codelet_list_idx = codelet_list_num;
     int nb_cd_matches = 0;
 #if !CONFIG_SMALL
     AVBPrint bp = { 0 };
 #endif
 
-    /* Array of all compiled codelet lists. Order is irrelevant. */
-    const FFTXCodelet * const * const codelet_list[] = {
-        ff_tx_codelet_list_float_c,
-        ff_tx_codelet_list_double_c,
-        ff_tx_codelet_list_int32_c,
-        ff_tx_null_list,
-#if HAVE_X86ASM
-        ff_tx_codelet_list_float_x86,
-#endif
-#if ARCH_AARCH64
-        ff_tx_codelet_list_float_aarch64,
-#endif
-    };
-    int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
-
     /* We still accept functions marked with SLOW, even if the CPU is
      * marked with the same flag, but we give them lower priority. */
     const int cpu_flags = av_get_cpu_flags();
-    const int slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW  |
-                          AV_CPU_FLAG_ATOM     | AV_CPU_FLAG_SSSE3SLOW |
-                          AV_CPU_FLAG_AVXSLOW  | AV_CPU_FLAG_SLOW_GATHER;
-
-    static const int slow_penalties[][2] = {
-        { AV_CPU_FLAG_SSE2SLOW,    1 + 64  },
-        { AV_CPU_FLAG_SSE3SLOW,    1 + 64  },
-        { AV_CPU_FLAG_SSSE3SLOW,   1 + 64  },
-        { AV_CPU_FLAG_ATOM,        1 + 128 },
-        { AV_CPU_FLAG_AVXSLOW,     1 + 128 },
-        { AV_CPU_FLAG_SLOW_GATHER, 1 + 32  },
-    };
 
     /* Flags the transform wants */
     uint64_t req_flags = flags;
@@ -519,13 +554,11 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
 
     /* Loop through all codelets in all codelet lists to find matches
      * to the requirements */
-    while (codelet_list_num--) {
-        const FFTXCodelet * const * list = codelet_list[codelet_list_num];
+    while (codelet_list_idx--) {
+        const FFTXCodelet * const * list = codelet_list[codelet_list_idx];
         const FFTXCodelet *cd = NULL;
 
         while ((cd = *list++)) {
-            int max_factor = 0;
-
             /* Check if the type matches */
             if (cd->type != TX_TYPE_ANY && type != cd->type)
                 continue;
@@ -546,7 +579,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
 
             /* Check if the CPU supports the required ISA */
             if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
-                !(cpu_flags & (cd->cpu_flags & ~slow_mask)))
+                !(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask)))
                 continue;
 
             /* Check for factors */
@@ -563,33 +596,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
 
             cd_matches                     = cd_tmp;
             cd_matches[nb_cd_matches].cd   = cd;
-            cd_matches[nb_cd_matches].prio = cd->prio;
-
-            /* If the CPU has a SLOW flag, and the instruction is also flagged
-             * as being slow for such, reduce its priority */
-            for (int i = 0; i < FF_ARRAY_ELEMS(slow_penalties); i++) {
-                if ((cpu_flags & cd->cpu_flags) & slow_penalties[i][0])
-                    cd_matches[nb_cd_matches].prio -= slow_penalties[i][1];
-            }
-
-            /* Prioritize aligned-only codelets */
-            if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
-                cd_matches[nb_cd_matches].prio += 64;
-
-            /* Codelets for specific lengths are generally faster */
-            if ((len == cd->min_len) && (len == cd->max_len))
-                cd_matches[nb_cd_matches].prio += 64;
-
-            /* Forward-only or inverse-only transforms are generally better */
-            if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
-                cd_matches[nb_cd_matches].prio += 64;
-
-            /* Larger factors are generally better */
-            for (int i = 0; i < TX_MAX_SUB; i++)
-                max_factor = FFMAX(cd->factors[i], max_factor);
-            if (max_factor)
-                cd_matches[nb_cd_matches].prio += 16*max_factor;
-
+            cd_matches[nb_cd_matches].prio = get_codelet_prio(cd, cpu_flags, len);
             nb_cd_matches++;
         }
     }



More information about the ffmpeg-cvslog mailing list