[FFmpeg-devel] [PATCH] sws: implement slice threading

Anton Khirnov anton at khirnov.net
Fri Sep 3 11:26:08 EEST 2021


---
Now actually committing the changes.
---
 libswscale/options.c          |  3 ++
 libswscale/swscale.c          | 59 ++++++++++++++++++++++++
 libswscale/swscale_internal.h | 14 ++++++
 libswscale/utils.c            | 85 +++++++++++++++++++++++++++++++++++
 4 files changed, 161 insertions(+)

diff --git a/libswscale/options.c b/libswscale/options.c
index 7eb2752543..4b71a23e37 100644
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -81,6 +81,9 @@ static const AVOption swscale_options[] = {
     { "uniform_color",   "blend onto a uniform color",    0,                 AV_OPT_TYPE_CONST,  { .i64  = SWS_ALPHA_BLEND_UNIFORM},INT_MIN, INT_MAX,     VE, "alphablend" },
     { "checkerboard",    "blend onto a checkerboard",     0,                 AV_OPT_TYPE_CONST,  { .i64  = SWS_ALPHA_BLEND_CHECKERBOARD},INT_MIN, INT_MAX,     VE, "alphablend" },
 
+    { "threads",         "number of threads",             OFFSET(nb_threads),   AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, VE, "threads" },
+        { "auto",        NULL,                            0,                  AV_OPT_TYPE_CONST, {.i64 = 0 },    .flags = VE, "threads" },
+
     { NULL }
 };
 
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index ca5c612b18..c233818dcf 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1113,6 +1113,9 @@ int sws_send_slice(struct SwsContext *c, unsigned int slice_start,
 
 unsigned int sws_receive_slice_alignment(const struct SwsContext *c)
 {
+    if (c->slice_ctx) // threaded parent: report the first slice context's alignment
+        return c->slice_ctx[0]->dst_slice_align;
+
     return c->dst_slice_align;
 }
 
@@ -1136,6 +1139,27 @@ int sws_receive_slice(struct SwsContext *c, unsigned int slice_start,
         return AVERROR(EINVAL);
     }
 
+    if (c->slicethread) {
+        int nb_jobs = c->slice_ctx[0]->dither == SWS_DITHER_ED ? 1 : c->nb_slice_ctx;
+        int ret = 0;
+
+        c->dst_slice_start  = slice_start;
+        c->dst_slice_height = slice_height;
+
+        avpriv_slicethread_execute(c->slicethread, nb_jobs, 0);
+
+        for (int i = 0; i < c->nb_slice_ctx; i++) {
+            if (c->slice_err[i] < 0) {
+                ret = c->slice_err[i];
+                break;
+            }
+        }
+
+        memset(c->slice_err, 0, c->nb_slice_ctx * sizeof(*c->slice_err));
+
+        return ret;
+    }
+
     for (int i = 0; i < FF_ARRAY_ELEMS(dst) && c->frame_dst->data[i]; i++) {
         dst[i] = c->frame_dst->data[i] +
                  c->frame_dst->linesize[i] * (slice_start >> c->chrDstVSubSample);
@@ -1173,6 +1197,41 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
                                   int srcSliceH, uint8_t *const dst[],
                                   const int dstStride[])
 {
+    if (c->nb_slice_ctx)
+        c = c->slice_ctx[0];
+
     return scale_internal(c, srcSlice, srcStride, srcSliceY, srcSliceH,
                           dst, dstStride, 0, c->dstH);
 }
+
+void ff_sws_slice_worker(void *priv, int jobnr, int threadnr,
+                         int nb_jobs, int nb_threads)
+{
+    SwsContext *parent = priv;
+    SwsContext      *c = parent->slice_ctx[threadnr];
+    // split the requested rows into nb_jobs bands, height rounded up to dst_slice_align
+    const int slice_height = FFALIGN(FFMAX((parent->dst_slice_height + nb_jobs - 1) / nb_jobs, 1),
+                                     c->dst_slice_align);
+    const int slice_start  = jobnr * slice_height;
+    const int slice_end    = FFMIN((jobnr + 1) * slice_height, parent->dst_slice_height);
+    int err = 0;
+
+    if (slice_end > slice_start) { // trailing jobs may have no rows left
+        uint8_t *dst[4] = { NULL };
+
+        for (int i = 0; i < FF_ARRAY_ELEMS(dst) && parent->frame_dst->data[i]; i++) {
+            const int vshift = (i == 1 || i == 2) ? c->chrDstVSubSample : 0;
+            // keep the row index signed: linesize[] may be negative, and int * unsigned would wrap
+            const ptrdiff_t row = (ptrdiff_t)((slice_start + parent->dst_slice_start) >> vshift);
+
+            dst[i] = parent->frame_dst->data[i] + parent->frame_dst->linesize[i] * row;
+        }
+
+        err = scale_internal(c, (const uint8_t * const *)parent->frame_src->data,
+                             parent->frame_src->linesize, 0, c->srcH,
+                             dst, parent->frame_dst->linesize,
+                             parent->dst_slice_start + slice_start, slice_end - slice_start);
+    }
+
+    parent->slice_err[threadnr] = err; // inspected by sws_receive_slice() after the jobs finish
+}
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 55fa6cec07..fbfc08a89f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -33,6 +33,7 @@
 #include "libavutil/mem_internal.h"
 #include "libavutil/pixfmt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/slicethread.h"
 #include "libavutil/ppc/util_altivec.h"
 
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
@@ -300,6 +301,15 @@ typedef struct SwsContext {
      */
     const AVClass *av_class;
 
+    AVSliceThread      *slicethread;
+    struct SwsContext **slice_ctx;
+    int                *slice_err;
+    int              nb_slice_ctx;
+
+    // values passed to current sws_receive_slice() call
+    unsigned int dst_slice_start;
+    unsigned int dst_slice_height;
+
     /**
      * Note that src, dst, srcStride, dstStride will be copied in the
      * sws_scale() wrapper so they can be freely modified here.
@@ -325,6 +335,7 @@ typedef struct SwsContext {
     int chrDstVSubSample;         ///< Binary logarithm of vertical   subsampling factor between luma/alpha and chroma planes in destination image.
     int vChrDrop;                 ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user.
     int sliceDir;                 ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).
+    int nb_threads;               ///< Number of threads used for scaling
     double param[2];              ///< Input parameters for scaling algorithms that need them.
 
     AVFrame *frame_src;
@@ -1082,6 +1093,9 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn
     yuv2interleavedX_fn yuv2nv12cX, yuv2packed1_fn yuv2packed1, yuv2packed2_fn yuv2packed2,
     yuv2packedX_fn yuv2packedX, yuv2anyX_fn yuv2anyX, int use_mmx);
 
+void ff_sws_slice_worker(void *priv, int jobnr, int threadnr,
+                         int nb_jobs, int nb_threads);
+
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 235a846809..25051ead72 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -49,6 +49,7 @@
 #include "libavutil/mathematics.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/slicethread.h"
 #include "libavutil/thread.h"
 #include "libavutil/aarch64/cpu.h"
 #include "libavutil/ppc/cpu.h"
@@ -871,6 +872,18 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
     const AVPixFmtDescriptor *desc_src;
     int need_reinit = 0;
 
+    if (c->nb_slice_ctx) {
+        for (int i = 0; i < c->nb_slice_ctx; i++) {
+            int ret = sws_setColorspaceDetails(c->slice_ctx[i], inv_table,
+                                               srcRange, table, dstRange,
+                                               brightness, contrast, saturation);
+            if (ret < 0)
+                return ret;
+        }
+
+        return 0;
+    }
+
     handle_formats(c);
     desc_dst = av_pix_fmt_desc_get(c->dstFormat);
     desc_src = av_pix_fmt_desc_get(c->srcFormat);
@@ -1005,6 +1018,12 @@ int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
     if (!c )
         return -1;
 
+    if (c->nb_slice_ctx) {
+        return sws_getColorspaceDetails(c->slice_ctx[0], inv_table, srcRange,
+                                        table, dstRange, brightness, contrast,
+                                        saturation);
+    }
+
     *inv_table  = c->srcColorspaceTable;
     *table      = c->dstColorspaceTable;
     *srcRange   = range_override_needed(c->srcFormat) ? 1 : c->srcRange;
@@ -1170,6 +1189,58 @@ static enum AVPixelFormat alphaless_fmt(enum AVPixelFormat fmt)
     }
 }
 
+static int context_init_threaded(SwsContext *c,
+                                 SwsFilter *src_filter, SwsFilter *dst_filter)
+{
+    int ret;
+    // create the slice-thread pool, then one child SwsContext per thread
+    ret = avpriv_slicethread_create(&c->slicethread, (void*)c,
+                                    ff_sws_slice_worker, NULL, c->nb_threads);
+    if (ret == AVERROR(ENOSYS)) {
+        c->nb_threads = 1; // no threading available: caller initializes single-threaded
+        return 0;
+    } else if (ret < 0)
+        return ret;
+
+    c->nb_threads = ret;
+
+    c->slice_ctx = av_mallocz_array(c->nb_threads, sizeof(*c->slice_ctx));
+    c->slice_err = av_mallocz_array(c->nb_threads, sizeof(*c->slice_err));
+    if (!c->slice_ctx || !c->slice_err)
+        return AVERROR(ENOMEM);
+
+    for (int i = 0; i < c->nb_threads; i++) {
+        c->slice_ctx[i] = sws_alloc_context();
+        if (!c->slice_ctx[i])
+            return AVERROR(ENOMEM);
+        // count it right away so sws_freeContext() releases it if a later step fails
+        c->nb_slice_ctx++;
+
+        ret = av_opt_copy((void*)c->slice_ctx[i], (void*)c);
+        if (ret < 0)
+            return ret;
+
+        c->slice_ctx[i]->nb_threads = 1;
+
+        ret = sws_init_context(c->slice_ctx[i], src_filter, dst_filter);
+        if (ret < 0)
+            return ret;
+
+        if (c->slice_ctx[i]->dither == SWS_DITHER_ED) {
+            av_log(c, AV_LOG_VERBOSE,
+                   "Error-diffusion dither is in use, scaling will be single-threaded.\n");
+            break;
+        }
+    }
+
+    c->frame_src = av_frame_alloc();
+    c->frame_dst = av_frame_alloc();
+    if (!c->frame_src || !c->frame_dst)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
 av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                              SwsFilter *dstFilter)
 {
@@ -1192,6 +1263,13 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     static const float float_mult = 1.0f / 255.0f;
     static AVOnce rgb2rgb_once = AV_ONCE_INIT;
 
+    if (c->nb_threads != 1) {
+        ret = context_init_threaded(c, srcFilter, dstFilter);
+        if (ret < 0 || c->nb_threads > 1)
+            return ret;
+        // threading disabled in this build, init as single-threaded
+    }
+
     cpu_flags = av_get_cpu_flags();
     flags     = c->flags;
     emms_c();
@@ -2254,6 +2332,13 @@ void sws_freeContext(SwsContext *c)
     if (!c)
         return;
 
+    for (i = 0; i < c->nb_slice_ctx; i++)
+        sws_freeContext(c->slice_ctx[i]);
+    av_freep(&c->slice_ctx);
+    av_freep(&c->slice_err);
+
+    avpriv_slicethread_free(&c->slicethread);
+
     for (i = 0; i < 4; i++)
         av_freep(&c->dither_error[i]);
 
-- 
2.30.2



More information about the ffmpeg-devel mailing list