[FFmpeg-devel] [PATCH 09/10] diracdec: run the final decoding stage/idwt for every plane in parallel

Rostislav Pehlivanov rpehlivanov at ob-encoder.com
Thu Jun 23 19:07:03 CEST 2016


Yields a 27% decoding performance increase for a 12-bit 4K file.

Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
---
 libavcodec/diracdec.c | 152 ++++++++++++++++++++++++++------------------------
 1 file changed, 80 insertions(+), 72 deletions(-)

diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c
index 63eb4d1..ec45132 100644
--- a/libavcodec/diracdec.c
+++ b/libavcodec/diracdec.c
@@ -1804,99 +1804,107 @@ static int interpolate_refplane(DiracContext *s, DiracFrame *ref, int plane, int
     return 0;
 }
 
-/**
- * Dirac Specification ->
- * 13.0 Transform data syntax. transform_data()
- */
-static int dirac_decode_frame_internal(DiracContext *s)
+static int decode_plane(AVCodecContext *avctx, void *arg, int jobnr, int thread)
 {
     DWTContext d;
-    int y, i, comp, dsty;
-    int ret;
+    int i, y, ret, dsty;
+    DiracContext *s = avctx->priv_data;
+    Plane *p        = &s->plane[jobnr];
+    uint8_t *frame  = s->current_picture->avframe->data[jobnr];
 
-    if (s->low_delay) {
-        /* [DIRAC_STD] 13.5.1 low_delay_transform_data() */
-        for (comp = 0; comp < 3; comp++) {
-            Plane *p = &s->plane[comp];
-            memset(p->idwt.buf, 0, p->idwt.stride * p->idwt.height);
-        }
-        if (!s->zero_res) {
-            if ((ret = decode_lowdelay(s)) < 0)
-                return ret;
-        }
+    /* FIXME: small resolutions */
+    for (i = 0; i < 4; i++)
+        s->edge_emu_buffer[i] = s->edge_emu_buffer_base + i*FFALIGN(p->width, 16);
+
+    if (!s->zero_res && !s->low_delay)
+    {
+        memset(p->idwt.buf, 0, p->idwt.stride * p->idwt.height);
+        decode_component(s, jobnr); /* [DIRAC_STD] 13.4.1 core_transform_data() */
     }
+    ret = ff_spatial_idwt_init(&d, &p->idwt, s->wavelet_idx+2,
+                               s->wavelet_depth, s->bit_depth);
+    if (ret < 0)
+        return ret;
 
-    for (comp = 0; comp < 3; comp++) {
-        Plane *p       = &s->plane[comp];
-        uint8_t *frame = s->current_picture->avframe->data[comp];
+    if (!s->num_refs) { /* intra */
+        for (y = 0; y < p->height; y += 16) {
+            int idx = (s->bit_depth - 8) >> 1;
+            ff_spatial_idwt_slice2(&d, y+16); /* decode */
+            s->diracdsp.put_signed_rect_clamped[idx](frame + y*p->stride,
+                                                     p->stride,
+                                                     p->idwt.buf + y*p->idwt.stride,
+                                                     p->idwt.stride, p->width, 16);
+        }
+    } else { /* inter */
+        int rowheight = p->ybsep*p->stride;
 
-        /* FIXME: small resolutions */
-        for (i = 0; i < 4; i++)
-            s->edge_emu_buffer[i] = s->edge_emu_buffer_base + i*FFALIGN(p->width, 16);
+        select_dsp_funcs(s, p->width, p->height, p->xblen, p->yblen);
 
-        if (!s->zero_res && !s->low_delay)
-        {
-            memset(p->idwt.buf, 0, p->idwt.stride * p->idwt.height);
-            decode_component(s, comp); /* [DIRAC_STD] 13.4.1 core_transform_data() */
+        for (i = 0; i < s->num_refs; i++) {
+            int ret = interpolate_refplane(s, s->ref_pics[i], jobnr, p->width, p->height);
+            if (ret < 0)
+                return ret;
         }
-        ret = ff_spatial_idwt_init(&d, &p->idwt, s->wavelet_idx+2,
-                                   s->wavelet_depth, s->bit_depth);
-        if (ret < 0)
-            return ret;
 
-        if (!s->num_refs) { /* intra */
-            for (y = 0; y < p->height; y += 16) {
-                int idx = (s->bit_depth - 8) >> 1;
-                ff_spatial_idwt_slice2(&d, y+16); /* decode */
-                s->diracdsp.put_signed_rect_clamped[idx](frame + y*p->stride,
-                                                         p->stride,
-                                                         p->idwt.buf + y*p->idwt.stride,
-                                                         p->idwt.stride, p->width, 16);
-            }
-        } else { /* inter */
-            int rowheight = p->ybsep*p->stride;
+        memset(s->mctmp, 0, 4*p->yoffset*p->stride);
 
-            select_dsp_funcs(s, p->width, p->height, p->xblen, p->yblen);
+        dsty = -p->yoffset;
+        for (y = 0; y < s->blheight; y++) {
+            int h     = 0,
+                start = FFMAX(dsty, 0);
+            uint16_t *mctmp    = s->mctmp + y*rowheight;
+            DiracBlock *blocks = s->blmotion + y*s->blwidth;
 
-            for (i = 0; i < s->num_refs; i++) {
-                int ret = interpolate_refplane(s, s->ref_pics[i], comp, p->width, p->height);
-                if (ret < 0)
-                    return ret;
-            }
+            init_obmc_weights(s, p, y);
 
-            memset(s->mctmp, 0, 4*p->yoffset*p->stride);
+            if (y == s->blheight-1 || start+p->ybsep > p->height)
+                h = p->height - start;
+            else
+                h = p->ybsep - (start - dsty);
+            if (h < 0)
+                break;
 
-            dsty = -p->yoffset;
-            for (y = 0; y < s->blheight; y++) {
-                int h     = 0,
-                    start = FFMAX(dsty, 0);
-                uint16_t *mctmp    = s->mctmp + y*rowheight;
-                DiracBlock *blocks = s->blmotion + y*s->blwidth;
+            memset(mctmp+2*p->yoffset*p->stride, 0, 2*rowheight);
+            mc_row(s, blocks, mctmp, jobnr, dsty);
 
-                init_obmc_weights(s, p, y);
+            mctmp += (start - dsty)*p->stride + p->xoffset;
+            ff_spatial_idwt_slice2(&d, start + h); /* decode */
+            /* NOTE: add_rect_clamped hasn't been templated hence the shifts.
+             * idwt.stride is passed as pixels, not in bytes as in the rest of the decoder */
+            s->diracdsp.add_rect_clamped(frame + start*p->stride, mctmp, p->stride,
+                                         (int16_t*)(p->idwt.buf) + start*(p->idwt.stride >> 1), (p->idwt.stride >> 1), p->width, h);
 
-                if (y == s->blheight-1 || start+p->ybsep > p->height)
-                    h = p->height - start;
-                else
-                    h = p->ybsep - (start - dsty);
-                if (h < 0)
-                    break;
+            dsty += p->ybsep;
+        }
+    }
 
-                memset(mctmp+2*p->yoffset*p->stride, 0, 2*rowheight);
-                mc_row(s, blocks, mctmp, comp, dsty);
+    return 0;
+}
 
-                mctmp += (start - dsty)*p->stride + p->xoffset;
-                ff_spatial_idwt_slice2(&d, start + h); /* decode */
-                /* NOTE: add_rect_clamped hasn't been templated hence the shifts.
-                 * idwt.stride is passed as pixels, not in bytes as in the rest of the decoder */
-                s->diracdsp.add_rect_clamped(frame + start*p->stride, mctmp, p->stride,
-                                             (int16_t*)(p->idwt.buf) + start*(p->idwt.stride >> 1), (p->idwt.stride >> 1), p->width, h);
+/**
+ * Dirac Specification ->
+ * 13.0 Transform data syntax. transform_data()
+ */
+static int dirac_decode_frame_internal(DiracContext *s)
+{
+    int ret, comp, res[3];
 
-                dsty += p->ybsep;
-            }
+    if (s->low_delay) {
+        /* [DIRAC_STD] 13.5.1 low_delay_transform_data() */
+        for (comp = 0; comp < 3; comp++) {
+            Plane *p = &s->plane[comp];
+            memset(p->idwt.buf, 0, p->idwt.stride * p->idwt.height);
+        }
+        if (!s->zero_res) {
+            if ((ret = decode_lowdelay(s)) < 0)
+                return ret;
         }
     }
 
+    s->avctx->execute2(s->avctx, decode_plane, NULL, res, 3);
+    for (comp = 0; comp < 3; comp++)
+        if (res[comp])
+            return res[comp];
 
     return 0;
 }
-- 
2.8.1.369.geae769a



More information about the ffmpeg-devel mailing list