[FFmpeg-devel] [PATCH] sws: add a new scaling API

Sun Aug 29 19:38:58 EEST 2021

---
Now with a new public function to query required slice alignment, which
fixes the yuv410p->yuv420p conversion issue reported by Michael.
---
 libswscale/swscale.c          | 294 ++++++++++++++++++++++++++--------
 libswscale/swscale.h          |  90 +++++++++++
 libswscale/swscale_internal.h |  21 +++
 libswscale/swscale_unscaled.c |   2 +
 libswscale/utils.c            |  72 +++++++++
 5 files changed, 416 insertions(+), 63 deletions(-)

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 61dfcb4dff..ca5c612b18 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -236,13 +236,16 @@ static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
         av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
 
 static int swscale(SwsContext *c, const uint8_t *src[],
-                   int srcStride[], int srcSliceY,
-                   int srcSliceH, uint8_t *dst[], int dstStride[])
+                   int srcStride[], int srcSliceY, int srcSliceH,
+                   uint8_t *dst[], int dstStride[],
+                   int dstSliceY, int dstSliceH)
 {
+    const int scale_dst = dstSliceY > 0 || dstSliceH < c->dstH;
+
     /* load a few things into local vars to make the code more readable?
      * and faster */
     const int dstW                   = c->dstW;
-    const int dstH                   = c->dstH;
+    int dstH                         = c->dstH;
 
     const enum AVPixelFormat dstFormat = c->dstFormat;
     const int flags                  = c->flags;
@@ -331,10 +334,15 @@ static int swscale(SwsContext *c, const uint8_t *src[],
         }
     }
 
-    /* Note the user might start scaling the picture in the middle so this
-     * will not get executed. This is not really intended but works
-     * currently, so people might do it. */
-    if (srcSliceY == 0) {
+    if (scale_dst) {
+        dstY         = dstSliceY;
+        dstH         = dstY + dstSliceH;
+        lastInLumBuf = -1;
+        lastInChrBuf = -1;
+    } else if (srcSliceY == 0) {
+        /* Note the user might start scaling the picture in the middle so this
+         * will not get executed. This is not really intended but works
+         * currently, so people might do it. */
         dstY         = 0;
         lastInLumBuf = -1;
         lastInChrBuf = -1;
@@ -352,8 +360,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
             srcSliceY, srcSliceH, chrSrcSliceY, chrSrcSliceH, 1);
 
     ff_init_slice_from_src(vout_slice, (uint8_t**)dst, dstStride, c->dstW,
-            dstY, dstH, dstY >> c->chrDstVSubSample,
-            AV_CEIL_RSHIFT(dstH, c->chrDstVSubSample), 0);
+            dstY, dstSliceH, dstY >> c->chrDstVSubSample,
+            AV_CEIL_RSHIFT(dstSliceH, c->chrDstVSubSample), scale_dst);
     if (srcSliceY == 0) {
         hout_slice->plane[0].sliceY = lastInLumBuf + 1;
         hout_slice->plane[1].sliceY = lastInChrBuf + 1;
@@ -373,7 +381,7 @@ static int swscale(SwsContext *c, const uint8_t *src[],
 
         // First line needed as input
         const int firstLumSrcY  = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
-        const int firstLumSrcY2 = FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1 << c->chrDstVSubSample) - 1), dstH - 1)]);
+        const int firstLumSrcY2 = FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1 << c->chrDstVSubSample) - 1), c->dstH - 1)]);
         // First line needed as input
         const int firstChrSrcY  = FFMAX(1 - vChrFilterSize, vChrFilterPos[chrDstY]);
 
@@ -477,7 +485,7 @@ static int swscale(SwsContext *c, const uint8_t *src[],
             c->chrDither8 = ff_dither_8x8_128[chrDstY & 7];
             c->lumDither8 = ff_dither_8x8_128[dstY    & 7];
         }
-        if (dstY >= dstH - 2) {
+        if (dstY >= c->dstH - 2) {
             /* hmm looks like we can't use MMX here without overwriting
              * this array's tail */
             ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
@@ -491,21 +499,22 @@ static int swscale(SwsContext *c, const uint8_t *src[],
             desc[i].process(c, &desc[i], dstY, 1);
     }
     if (isPlanar(dstFormat) && isALPHA(dstFormat) && !needAlpha) {
+        int offset = lastDstY - dstSliceY;
         int length = dstW;
         int height = dstY - lastDstY;
 
         if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
             const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
-            fillPlane16(dst[3], dstStride[3], length, height, lastDstY,
+            fillPlane16(dst[3], dstStride[3], length, height, offset,
                     1, desc->comp[3].depth,
                     isBE(dstFormat));
         } else if (is32BPS(dstFormat)) {
             const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
-            fillPlane32(dst[3], dstStride[3], length, height, lastDstY,
+            fillPlane32(dst[3], dstStride[3], length, height, offset,
                     1, desc->comp[3].depth,
                     isBE(dstFormat), desc->flags & AV_PIX_FMT_FLAG_FLOAT);
         } else
-            fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255);
+            fillPlane(dst[3], dstStride[3], length, height, offset, 255);
     }
 
 #if HAVE_MMXEXT_INLINE
@@ -809,33 +818,42 @@ static void update_palette(SwsContext *c, const uint32_t *pal)
     }
 }
 
+static int scale_internal(SwsContext *c,
+                          const uint8_t * const srcSlice[], const int srcStride[],
+                          int srcSliceY, int srcSliceH,
+                          uint8_t *const dstSlice[], const int dstStride[],
+                          int dstSliceY, int dstSliceH);
+
 static int scale_gamma(SwsContext *c,
                        const uint8_t * const srcSlice[], const int srcStride[],
                        int srcSliceY, int srcSliceH,
-                       uint8_t * const dst[], const int dstStride[])
+                       uint8_t * const dstSlice[], const int dstStride[],
+                       int dstSliceY, int dstSliceH)
 {
-    int ret = sws_scale(c->cascaded_context[0],
-                        srcSlice, srcStride, srcSliceY, srcSliceH,
-                        c->cascaded_tmp, c->cascaded_tmpStride);
+    int ret = scale_internal(c->cascaded_context[0],
+                             srcSlice, srcStride, srcSliceY, srcSliceH,
+                             c->cascaded_tmp, c->cascaded_tmpStride, 0, c->srcH);
 
     if (ret < 0)
         return ret;
 
     if (c->cascaded_context[2])
-        ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp,
-                        c->cascaded_tmpStride, srcSliceY, srcSliceH, c->cascaded1_tmp,
-                        c->cascaded1_tmpStride);
+        ret = scale_internal(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp,
+                             c->cascaded_tmpStride, srcSliceY, srcSliceH,
+                             c->cascaded1_tmp, c->cascaded1_tmpStride, 0, c->dstH);
     else
-        ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp,
-                        c->cascaded_tmpStride, srcSliceY, srcSliceH, dst, dstStride);
+        ret = scale_internal(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp,
+                             c->cascaded_tmpStride, srcSliceY, srcSliceH,
+                             dstSlice, dstStride, dstSliceY, dstSliceH);
 
     if (ret < 0)
         return ret;
 
     if (c->cascaded_context[2]) {
-        ret = sws_scale(c->cascaded_context[2], (const uint8_t * const *)c->cascaded1_tmp,
-                        c->cascaded1_tmpStride, c->cascaded_context[1]->dstY - ret,
-                        c->cascaded_context[1]->dstY, dst, dstStride);
+        ret = scale_internal(c->cascaded_context[2], (const uint8_t * const *)c->cascaded1_tmp,
+                             c->cascaded1_tmpStride, c->cascaded_context[1]->dstY - ret,
+                             c->cascaded_context[1]->dstY,
+                             dstSlice, dstStride, dstSliceY, dstSliceH);
     }
     return ret;
 }
@@ -843,56 +861,64 @@ static int scale_gamma(SwsContext *c,
 static int scale_cascaded(SwsContext *c,
                           const uint8_t * const srcSlice[], const int srcStride[],
                           int srcSliceY, int srcSliceH,
-                          uint8_t * const dst[], const int dstStride[])
+                          uint8_t * const dstSlice[], const int dstStride[],
+                          int dstSliceY, int dstSliceH)
 {
-    int ret = sws_scale(c->cascaded_context[0],
-                        srcSlice, srcStride, srcSliceY, srcSliceH,
-                        c->cascaded_tmp, c->cascaded_tmpStride);
+    int ret = scale_internal(c->cascaded_context[0],
+                             srcSlice, srcStride, srcSliceY, srcSliceH,
+                             c->cascaded_tmp, c->cascaded_tmpStride,
+                             0, c->cascaded_context[0]->dstH);
     if (ret < 0)
         return ret;
-    ret = sws_scale(c->cascaded_context[1],
-                    (const uint8_t * const * )c->cascaded_tmp, c->cascaded_tmpStride,
-                    0, c->cascaded_context[0]->dstH, dst, dstStride);
+    ret = scale_internal(c->cascaded_context[1],
+                         (const uint8_t * const * )c->cascaded_tmp, c->cascaded_tmpStride,
+                         0, c->cascaded_context[0]->dstH,
+                         dstSlice, dstStride, dstSliceY, dstSliceH);
     return ret;
 }
 
-/**
- * swscale wrapper, so we don't need to export the SwsContext.
- * Assumes planar YUV to be in YUV order instead of YVU.
- */
-int attribute_align_arg sws_scale(struct SwsContext *c,
-                                  const uint8_t * const srcSlice[],
-                                  const int srcStride[], int srcSliceY,
-                                  int srcSliceH, uint8_t *const dst[],
-                                  const int dstStride[])
+static int scale_internal(SwsContext *c,
+                          const uint8_t * const srcSlice[], const int srcStride[],
+                          int srcSliceY, int srcSliceH,
+                          uint8_t *const dstSlice[], const int dstStride[],
+                          int dstSliceY, int dstSliceH)
 {
-    const int frame_start = !c->sliceDir;
+    const int scale_dst = dstSliceY > 0 || dstSliceH < c->dstH;
+    const int frame_start = scale_dst || !c->sliceDir;
     int i, ret;
     const uint8_t *src2[4];
     uint8_t *dst2[4];
-    int macro_height = isBayer(c->srcFormat) ? 2 : (1 << c->chrSrcVSubSample);
+    int macro_height_src = isBayer(c->srcFormat) ? 2 : (1 << c->chrSrcVSubSample);
+    int macro_height_dst = isBayer(c->dstFormat) ? 2 : (1 << c->chrDstVSubSample);
     // copy strides, so they can safely be modified
     int srcStride2[4];
     int dstStride2[4];
     int srcSliceY_internal = srcSliceY;
 
-    if (!srcStride || !dstStride || !dst || !srcSlice) {
+    if (!srcStride || !dstStride || !dstSlice || !srcSlice) {
         av_log(c, AV_LOG_ERROR, "One of the input parameters to sws_scale() is NULL, please check the calling code\n");
         return AVERROR(EINVAL);
     }
 
-    if ((srcSliceY & (macro_height-1)) ||
-        ((srcSliceH& (macro_height-1)) && srcSliceY + srcSliceH != c->srcH) ||
+    if ((srcSliceY  & (macro_height_src - 1)) ||
+        ((srcSliceH & (macro_height_src - 1)) && srcSliceY + srcSliceH != c->srcH) ||
         srcSliceY + srcSliceH > c->srcH) {
         av_log(c, AV_LOG_ERROR, "Slice parameters %d, %d are invalid\n", srcSliceY, srcSliceH);
         return AVERROR(EINVAL);
     }
 
+    if ((dstSliceY  & (macro_height_dst - 1)) ||
+        ((dstSliceH & (macro_height_dst - 1)) && dstSliceY + dstSliceH != c->dstH) ||
+        dstSliceY + dstSliceH > c->dstH) {
+        av_log(c, AV_LOG_ERROR, "Slice parameters %d, %d are invalid\n", dstSliceY, dstSliceH);
+        return AVERROR(EINVAL);
+    }
+
     if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) {
         av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
         return AVERROR(EINVAL);
     }
-    if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
+    if (!check_image_pointers((const uint8_t* const*)dstSlice, c->dstFormat, dstStride)) {
         av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
         return AVERROR(EINVAL);
     }
@@ -902,10 +928,12 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
         return 0;
 
     if (c->gamma_flag && c->cascaded_context[0])
-        return scale_gamma(c, srcSlice, srcStride, srcSliceY, srcSliceH, dst, dstStride);
+        return scale_gamma(c, srcSlice, srcStride, srcSliceY, srcSliceH,
+                           dstSlice, dstStride, dstSliceY, dstSliceH);
 
     if (c->cascaded_context[0] && srcSliceY == 0 && srcSliceH == c->cascaded_context[0]->srcH)
-        return scale_cascaded(c, srcSlice, srcStride, srcSliceY, srcSliceH, dst, dstStride);
+        return scale_cascaded(c, srcSlice, srcStride, srcSliceY, srcSliceH,
+                              dstSlice, dstStride, dstSliceY, dstSliceH);
 
     if (!srcSliceY && (c->flags & SWS_BITEXACT) && c->dither == SWS_DITHER_ED && c->dither_error[0])
         for (i = 0; i < 4; i++)
@@ -915,18 +943,19 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
         update_palette(c, (const uint32_t *)srcSlice[1]);
 
     memcpy(src2,       srcSlice,  sizeof(src2));
-    memcpy(dst2,       dst,       sizeof(dst2));
+    memcpy(dst2,       dstSlice,  sizeof(dst2));
     memcpy(srcStride2, srcStride, sizeof(srcStride2));
     memcpy(dstStride2, dstStride, sizeof(dstStride2));
 
-    if (frame_start) {
+    if (frame_start && !scale_dst) {
         if (srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
             av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
             return AVERROR(EINVAL);
         }
 
         c->sliceDir = (srcSliceY == 0) ? 1 : -1;
-    }
+    } else if (scale_dst)
+        c->sliceDir = 1;
 
     if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) {
         uint8_t *base;
@@ -985,26 +1014,165 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
     reset_ptr(src2, c->srcFormat);
     reset_ptr((void*)dst2, c->dstFormat);
 
-    if (c->convert_unscaled)
-        ret = c->convert_unscaled(c, src2, srcStride2, srcSliceY_internal, srcSliceH,
+    if (c->convert_unscaled) {
+        int offset  = srcSliceY_internal;
+        int slice_h = srcSliceH;
+
+        // for dst slice scaling, offset the src pointers to match the dst slice
+        if (scale_dst) {
+            av_assert0(offset == 0);
+            for (i = 0; i < 4 && src2[i]; i++) {
+                if (!src2[i] || (i > 0 && usePal(c->srcFormat)))
+                    break;
+                src2[i] += (dstSliceY >> ((i == 1 || i == 2) ? c->chrSrcVSubSample : 0)) * srcStride2[i];
+            }
+            offset  = 0;
+            slice_h = dstSliceH;
+        }
+
+        ret = c->convert_unscaled(c, src2, srcStride2, offset, slice_h,
                                   dst2, dstStride2);
-    else
-        ret = swscale(c, src2, srcStride2, srcSliceY_internal, srcSliceH, dst2, dstStride2);
+    } else {
+        ret = swscale(c, src2, srcStride2, srcSliceY_internal, srcSliceH,
+                      dst2, dstStride2, dstSliceY, dstSliceH);
+    }
 
     if (c->dstXYZ && !(c->srcXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) {
-        int dstY = c->dstY ? c->dstY : srcSliceY + srcSliceH;
-        uint16_t *dst16 = (uint16_t*)(dst2[0] + (dstY - ret) * dstStride2[0]);
-        av_assert0(dstY >= ret);
-        av_assert0(ret >= 0);
-        av_assert0(c->dstH >= dstY);
+        uint16_t *dst16;
+
+        if (scale_dst) {
+            dst16 = (uint16_t *)dst2[0];
+        } else {
+            int dstY = c->dstY ? c->dstY : srcSliceY + srcSliceH;
+
+            av_assert0(dstY >= ret);
+            av_assert0(ret >= 0);
+            av_assert0(c->dstH >= dstY);
+            dst16 = (uint16_t*)(dst2[0] + (dstY - ret) * dstStride2[0]);
+        }
 
         /* replace on the same data */
         rgb48Toxyz12(c, dst16, dst16, dstStride2[0]/2, ret);
     }
 
     /* reset slice direction at end of frame */
-    if (srcSliceY_internal + srcSliceH == c->srcH)
+    if ((srcSliceY_internal + srcSliceH == c->srcH) || scale_dst)
         c->sliceDir = 0;
 
     return ret;
 }
+
+void sws_frame_end(struct SwsContext *c)
+{
+    av_frame_unref(c->frame_src);   /* drop the references taken by sws_frame_start() */
+    av_frame_unref(c->frame_dst);
+    c->src_ranges.nb_ranges = 0;    /* forget the input rows announced via sws_send_slice() */
+}
+
+int sws_frame_start(struct SwsContext *c, AVFrame *dst, const AVFrame *src)
+{
+    int ret, allocated = 0;
+
+    ret = av_frame_ref(c->frame_src, src);
+    if (ret < 0)
+        return ret;
+
+    /* a dst without buffers is allocated here, using the output parameters */
+    if (!dst->buf[0]) {
+        dst->width  = c->dstW;
+        dst->height = c->dstH;
+        dst->format = c->dstFormat;
+        ret = av_frame_get_buffer(dst, 0);
+        if (ret < 0)
+            goto fail;
+        allocated = 1;
+    }
+
+    ret = av_frame_ref(c->frame_dst, dst);
+    if (ret >= 0)
+        return 0;
+    if (allocated)
+        av_frame_unref(dst);
+fail:
+    /* do not leave a stale source reference behind when setup fails */
+    av_frame_unref(c->frame_src);
+    return ret;
+}
+
+int sws_send_slice(struct SwsContext *c, unsigned int slice_start,
+                   unsigned int slice_height)
+{
+    int ret;
+    /* only bookkeeping happens here; ff_range_add() rejects overlapping rows */
+    ret = ff_range_add(&c->src_ranges, slice_start, slice_height);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+unsigned int sws_receive_slice_alignment(const struct SwsContext *c)
+{
+    return c->dst_slice_align; /* assigned in sws_init_context() and ff_get_unscaled_swscale() */
+}
+
+int sws_receive_slice(struct SwsContext *c, unsigned int slice_start,
+                      unsigned int slice_height)
+{
+    unsigned int align = sws_receive_slice_alignment(c);
+    uint8_t *dst[4] = { NULL };
+
+    /* wait until complete input has been received */
+    if (!(c->src_ranges.nb_ranges == 1 && c->src_ranges.ranges[0].start == 0 &&
+          c->src_ranges.ranges[0].len == c->srcH))
+        return AVERROR(EAGAIN);
+
+    if ((slice_start > 0 || slice_height < c->dstH) &&
+        (slice_start % align || slice_height % align)) {
+        av_log(c, AV_LOG_ERROR,
+               "Incorrectly aligned output: %u/%u not multiples of %u\n",
+               slice_start, slice_height, align);
+        return AVERROR(EINVAL);
+    }
+
+    /* only chroma rows are subsampled; signed math keeps negative strides valid */
+    for (int i = 0; i < FF_ARRAY_ELEMS(dst) && c->frame_dst->data[i]; i++) {
+        dst[i] = c->frame_dst->data[i] + c->frame_dst->linesize[i] *
+                 (int)(slice_start >> (i == 1 || i == 2 ? c->chrDstVSubSample : 0));
+    }
+
+    return scale_internal(c, (const uint8_t * const *)c->frame_src->data,
+                          c->frame_src->linesize, 0, c->srcH,
+                          dst, c->frame_dst->linesize, slice_start, slice_height);
+}
+
+int sws_scale_frame(struct SwsContext *c, AVFrame *dst, const AVFrame *src)
+{
+    int ret;
+
+    ret = sws_frame_start(c, dst, src);
+    if (ret < 0)
+        return ret;
+    /* announce the whole input at once, then request the whole output */
+    ret = sws_send_slice(c, 0, src->height);
+    if (ret >= 0)
+        ret = sws_receive_slice(c, 0, dst->height);
+    /* release the frame references whether or not scaling succeeded */
+    sws_frame_end(c);
+
+    return ret;
+}
+
+/**
+ * swscale wrapper, so we don't need to export the SwsContext.
+ * Assumes planar YUV to be in YUV order instead of YVU.
+ */
+int attribute_align_arg sws_scale(struct SwsContext *c,
+                                  const uint8_t * const srcSlice[],
+                                  const int srcStride[], int srcSliceY,
+                                  int srcSliceH, uint8_t *const dst[],
+                                  const int dstStride[])
+{
+    return scale_internal(c, srcSlice, srcStride, srcSliceY, srcSliceH,
+                          dst, dstStride, 0, c->dstH); /* whole destination, no dst slicing */
+}
diff --git a/libswscale/swscale.h b/libswscale/swscale.h
index 50d6d46553..77067e79dc 100644
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -30,6 +30,7 @@
 #include <stdint.h>
 
 #include "libavutil/avutil.h"
+#include "libavutil/frame.h"
 #include "libavutil/log.h"
 #include "libavutil/pixfmt.h"
 #include "version.h"
@@ -218,6 +219,95 @@ int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
               const int srcStride[], int srcSliceY, int srcSliceH,
               uint8_t *const dst[], const int dstStride[]);
 
+/**
+ * Scale source data from src and write the output to dst.
+ *
+ * This is merely a convenience wrapper around
+ * - sws_frame_start()
+ * - sws_send_slice(0, src->height)
+ * - sws_receive_slice(0, dst->height)
+ * - sws_frame_end()
+ *
+ * @param dst The destination frame. See documentation for sws_frame_start() for
+ *            more details.
+ * @param src The source frame.
+ *
+ * @return a non-negative number on success, a negative AVERROR code on failure
+ */
+int sws_scale_frame(struct SwsContext *c, AVFrame *dst, const AVFrame *src);
+
+/**
+ * Initialize the scaling process for a given pair of source/destination frames.
+ * Must be called before any calls to sws_send_slice() and sws_receive_slice().
+ *
+ * This function will retain references to src and dst, so they must both use
+ * refcounted buffers (if allocated by the caller, in case of dst).
+ *
+ * @param dst The destination frame.
+ *
+ *            The data buffers may either be already allocated by the caller or
+ *            left clear, in which case they will be allocated by the scaler.
+ *            The latter may have performance advantages - e.g. in certain cases
+ *            some output planes may be references to input planes, rather than
+ *            copies.
+ *
+ *            Output data will be written into this frame in successful
+ *            sws_receive_slice() calls.
+ * @param src The source frame. The data buffers must be allocated, but the
+ *            frame data does not have to be ready at this point. Data
+ *            availability is then signalled by sws_send_slice().
+ * @return 0 on success, a negative AVERROR code on failure
+ *
+ * @see sws_frame_end()
+ */
+int sws_frame_start(struct SwsContext *c, AVFrame *dst, const AVFrame *src);
+
+/**
+ * Finish the scaling process for a pair of source/destination frames previously
+ * submitted with sws_frame_start(). Must be called after all sws_send_slice()
+ * and sws_receive_slice() calls are done, before any new sws_frame_start()
+ * calls.
+ */
+void sws_frame_end(struct SwsContext *c);
+
+/**
+ * Indicate that a horizontal slice of input data is available in the source
+ * frame previously provided to sws_frame_start(). The slices may be provided in
+ * any order, but may not overlap. For vertically subsampled pixel formats, the
+ * slices must be aligned according to subsampling.
+ *
+ * @param slice_start first row of the slice
+ * @param slice_height number of rows in the slice
+ *
+ * @return a non-negative number on success, a negative AVERROR code on failure.
+ */
+int sws_send_slice(struct SwsContext *c, unsigned int slice_start,
+                   unsigned int slice_height);
+
+/**
+ * Request a horizontal slice of the output data to be written into the frame
+ * previously provided to sws_frame_start().
+ *
+ * When a slice smaller than the whole output frame is requested, both
+ * slice_start and slice_height must be multiples of
+ * sws_receive_slice_alignment().
+ *
+ * @param slice_start first row of the slice
+ * @param slice_height number of rows in the slice
+ *
+ * @return a non-negative number if the data was successfully written into the output
+ *         AVERROR(EAGAIN) if more input data needs to be provided before the
+ *                         output can be produced
+ *         another negative AVERROR code on other kinds of scaling failure
+ */
+int sws_receive_slice(struct SwsContext *c, unsigned int slice_start,
+                      unsigned int slice_height);
+
+/**
+ * Query required alignment on output data requested with sws_receive_slice().
+ */
+unsigned int sws_receive_slice_alignment(const struct SwsContext *c);
+
 /**
  * @param dstRange flag indicating the while-black range of the output (1=jpeg / 0=mpeg)
  * @param srcRange flag indicating the while-black range of the input (1=jpeg / 0=mpeg)
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 673407636a..55fa6cec07 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -27,6 +27,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/avutil.h"
 #include "libavutil/common.h"
+#include "libavutil/frame.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/log.h"
 #include "libavutil/mem_internal.h"
@@ -80,6 +81,19 @@ typedef enum SwsAlphaBlend {
     SWS_ALPHA_BLEND_NB,
 } SwsAlphaBlend;
 
+typedef struct Range {
+    unsigned int start;   ///< first row covered by the range
+    unsigned int len;     ///< number of rows covered
+} Range;
+
+typedef struct RangeList {
+    Range          *ranges;
+    unsigned int nb_ranges;          ///< number of valid entries in ranges
+    unsigned int ranges_allocated;   ///< allocated size of ranges in bytes, updated by av_fast_realloc()
+} RangeList;
+
+int ff_range_add(RangeList *r, unsigned int start, unsigned int len);
+
 typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t *src[],
                        int srcStride[], int srcSliceY, int srcSliceH,
                        uint8_t *dst[], int dstStride[]);
@@ -313,6 +327,11 @@ typedef struct SwsContext {
     int sliceDir;                 ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).
     double param[2];              ///< Input parameters for scaling algorithms that need them.
 
+    AVFrame *frame_src;
+    AVFrame *frame_dst;
+
+    RangeList src_ranges;
+
     /* The cascaded_* fields allow spliting a scaler task into multiple
      * sequential steps, this is for example used to limit the maximum
      * downscaling factor that needs to be supported in one scaler.
@@ -638,6 +657,8 @@ typedef struct SwsContext {
     // then passed as input to further conversion
     uint8_t     *xyz_scratch;
     unsigned int xyz_scratch_allocated;
+
+    unsigned int dst_slice_align;
 } SwsContext;
 //FIXME check init (where 0)
 
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index c83af8bb07..7cb2a62f07 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -2009,6 +2009,7 @@ void ff_get_unscaled_swscale(SwsContext *c)
          srcFormat == AV_PIX_FMT_YUVA420P) && isAnyRGB(dstFormat) &&
         !(flags & SWS_ACCURATE_RND) && (c->dither == SWS_DITHER_BAYER || c->dither == SWS_DITHER_AUTO) && !(dstH & 1)) {
         c->convert_unscaled = ff_yuv2rgb_get_func_ptr(c);
+        c->dst_slice_align = 2;
     }
     /* yuv420p1x_to_p01x */
     if ((srcFormat == AV_PIX_FMT_YUV420P10 || srcFormat == AV_PIX_FMT_YUVA420P10 ||
@@ -2028,6 +2029,7 @@ void ff_get_unscaled_swscale(SwsContext *c)
         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
         !(flags & SWS_BITEXACT)) {
         c->convert_unscaled = yvu9ToYv12Wrapper;
+        c->dst_slice_align = 4;
     }
 
     /* bgr24toYV12 */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 176fc6fd63..235a846809 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1300,6 +1300,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     av_pix_fmt_get_chroma_sub_sample(srcFormat, &c->chrSrcHSubSample, &c->chrSrcVSubSample);
     av_pix_fmt_get_chroma_sub_sample(dstFormat, &c->chrDstHSubSample, &c->chrDstVSubSample);
 
+    c->dst_slice_align = 1 << c->chrDstVSubSample;
+
     if (isAnyRGB(dstFormat) && !(flags&SWS_FULL_CHR_H_INT)) {
         if (dstW&1) {
             av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to odd output size\n");
@@ -1424,6 +1426,11 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     if (!FF_ALLOCZ_TYPED_ARRAY(c->formatConvBuffer, FFALIGN(srcW * 2 + 78, 16) * 2))
         goto nomem;
 
+    c->frame_src = av_frame_alloc();
+    c->frame_dst = av_frame_alloc();
+    if (!c->frame_src || !c->frame_dst)
+        goto nomem;
+
     c->srcBpc = desc_src->comp[0].depth;
     if (c->srcBpc < 8)
         c->srcBpc = 8;
@@ -2250,6 +2257,11 @@ void sws_freeContext(SwsContext *c)
     for (i = 0; i < 4; i++)
         av_freep(&c->dither_error[i]);
 
+    av_frame_free(&c->frame_src);
+    av_frame_free(&c->frame_dst);
+
+    av_freep(&c->src_ranges.ranges);
+
     av_freep(&c->vLumFilter);
     av_freep(&c->vChrFilter);
     av_freep(&c->hLumFilter);
@@ -2364,3 +2376,63 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context, int srcW,
     }
     return context;
 }
+
+int ff_range_add(RangeList *rl, unsigned int start, unsigned int len)
+{
+    Range *tmp;
+    unsigned int idx;
+
+    /* the list is kept sorted by start: find the first range after the new one */
+    for (idx = 0; idx < rl->nb_ranges; idx++)
+        if (rl->ranges[idx].start > start)
+            break;
+
+    /* reject a range overlapping either neighbour */
+    if (idx > 0) {
+        Range *prev = &rl->ranges[idx - 1];
+        if (prev->start + prev->len > start)
+            return AVERROR(EINVAL);
+    }
+    if (idx < rl->nb_ranges) {
+        Range *next = &rl->ranges[idx];
+        if (start + len > next->start)
+            return AVERROR(EINVAL);
+    }
+
+    tmp = av_fast_realloc(rl->ranges, &rl->ranges_allocated,
+                          (rl->nb_ranges + 1) * sizeof(*rl->ranges));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    rl->ranges = tmp;
+
+    memmove(rl->ranges + idx + 1, rl->ranges + idx,
+            sizeof(*rl->ranges) * (rl->nb_ranges - idx));
+    rl->ranges[idx].start = start;
+    rl->ranges[idx].len   = len;
+    rl->nb_ranges++;
+
+    /* merge with adjacent ranges: the earlier range grows, the later is removed */
+    if (idx > 0) {
+        Range *prev = &rl->ranges[idx - 1];
+        Range *cur  = &rl->ranges[idx];
+        if (prev->start + prev->len == cur->start) {
+            prev->len += cur->len;
+            memmove(rl->ranges + idx, rl->ranges + idx + 1,
+                    sizeof(*rl->ranges) * (rl->nb_ranges - idx - 1));
+            rl->nb_ranges--;
+            idx--;
+        }
+    }
+    if (idx < rl->nb_ranges - 1) {
+        Range *cur  = &rl->ranges[idx];
+        Range *next = &rl->ranges[idx + 1];
+        if (cur->start + cur->len == next->start) {
+            cur->len += next->len;
+            memmove(rl->ranges + idx + 1, rl->ranges + idx + 2,
+                    sizeof(*rl->ranges) * (rl->nb_ranges - idx - 2));
+            rl->nb_ranges--;
+        }
+    }
+
+    return 0;
+}
-- 
2.30.2