[FFmpeg-devel] [PATCH] avcodec/prores_ks: avoid calling fdct twice per block

Mon Dec 31 23:01:45 EET 2018

The fdct is performed twice for each block: the first time during quantiser calculation, and the second time during slice encoding. If we save the DCT coefficients computed during the first pass, there is no need to run the fdct a second time.
Disadvantage: requires more memory.
Advantage: improves performance by ~4-5%.
---
 libavcodec/proresenc_kostya.c | 74 ++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 22 deletions(-)

diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c
index e045a972f1..d2f81e73f4 100644
--- a/libavcodec/proresenc_kostya.c
+++ b/libavcodec/proresenc_kostya.c
@@ -219,7 +219,6 @@ struct TrellisNode {
 #define MAX_STORED_Q 16
 
 typedef struct ProresThreadData {
-    DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
     DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16];
     int16_t custom_q[64];
     int16_t custom_chroma_q[64];
@@ -228,7 +227,6 @@ typedef struct ProresThreadData {
 
 typedef struct ProresContext {
     AVClass *class;
-    DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
     DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
     int16_t quants[MAX_STORED_Q][64];
     int16_t quants_chroma[MAX_STORED_Q][64];
@@ -237,6 +235,7 @@ typedef struct ProresContext {
     const uint8_t *quant_mat;
     const uint8_t *quant_chroma_mat;
     const uint8_t *scantable;
+    int16_t *blocks[MAX_PLANES];
 
     void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src,
                  ptrdiff_t linesize, int16_t *block);
@@ -562,6 +561,8 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
     int plane_factor, is_chroma;
     uint16_t *qmat;
     uint16_t *qmat_chroma;
+    int16_t *blocks;
+    DECLARE_ALIGNED(16, int16_t, dct_blocks)[16 * 16 * MAX_MBS_PER_SLICE];
 
     if (ctx->pictures_per_frame == 1)
         line_add = 0;
@@ -604,28 +605,38 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
         src = (const uint16_t*)(pic->data[i] + yp * linesize +
                                 line_add * pic->linesize[i]) + xp;
 
+        if (!ctx->force_quant) {
+            blocks = ctx->blocks[i] + (y * ctx->slices_width * ctx->mbs_per_slice + x) * 16 * 16;
+        } else {
+            blocks = dct_blocks;
+        }
+
         if (i < 3) {
-            get_slice_data(ctx, src, linesize, xp, yp,
-                           pwidth, avctx->height / ctx->pictures_per_frame,
-                           ctx->blocks[0], ctx->emu_buf,
-                           mbs_per_slice, num_cblocks, is_chroma);
+            if (ctx->force_quant) {
+                get_slice_data(ctx, src, linesize, xp, yp,
+                               pwidth, avctx->height / ctx->pictures_per_frame,
+                               blocks, ctx->emu_buf,
+                               mbs_per_slice, num_cblocks, is_chroma);
+            }
             if (!is_chroma) {/* luma quant */
                 sizes[i] = encode_slice_plane(ctx, pb, src, linesize,
-                                              mbs_per_slice, ctx->blocks[0],
+                                              mbs_per_slice, blocks,
                                               num_cblocks, plane_factor,
                                               qmat);
             } else { /* chroma plane */
                 sizes[i] = encode_slice_plane(ctx, pb, src, linesize,
-                                              mbs_per_slice, ctx->blocks[0],
+                                              mbs_per_slice, blocks,
                                               num_cblocks, plane_factor,
                                               qmat_chroma);
             }
         } else {
-            get_alpha_data(ctx, src, linesize, xp, yp,
-                           pwidth, avctx->height / ctx->pictures_per_frame,
-                           ctx->blocks[0], mbs_per_slice, ctx->alpha_bits);
+            if (ctx->force_quant) {
+                get_alpha_data(ctx, src, linesize, xp, yp,
+                               pwidth, avctx->height / ctx->pictures_per_frame,
+                               blocks, mbs_per_slice, ctx->alpha_bits);
+            }
             sizes[i] = encode_alpha_plane(ctx, pb, mbs_per_slice,
-                                          ctx->blocks[0], quant);
+                                          blocks, quant);
         }
         total_size += sizes[i];
         if (put_bits_left(pb) < 0) {
@@ -730,15 +741,15 @@ static int estimate_slice_plane(ProresContext *ctx, int *error, int plane,
                                 const uint16_t *src, ptrdiff_t linesize,
                                 int mbs_per_slice,
                                 int blocks_per_mb, int plane_size_factor,
-                                const int16_t *qmat, ProresThreadData *td)
+                                const int16_t *qmat, int16_t *blocks)
 {
     int blocks_per_slice;
     int bits;
 
     blocks_per_slice = mbs_per_slice * blocks_per_mb;
 
-    bits  = estimate_dcs(error, td->blocks[plane], blocks_per_slice, qmat[0]);
-    bits += estimate_acs(error, td->blocks[plane], blocks_per_slice,
+    bits  = estimate_dcs(error, blocks, blocks_per_slice, qmat[0]);
+    bits += estimate_acs(error, blocks, blocks_per_slice,
                          plane_size_factor, ctx->scantable, qmat);
 
     return FFALIGN(bits, 8);
@@ -819,6 +830,7 @@ static int find_slice_quant(AVCodecContext *avctx,
     int overquant;
     uint16_t *qmat;
     uint16_t *qmat_chroma;
+    int16_t *blocks[MAX_PLANES];
     int linesize[4], line_add;
     int alpha_bits = 0;
 
@@ -848,16 +860,17 @@ static int find_slice_quant(AVCodecContext *avctx,
         linesize[i] = ctx->pic->linesize[i] * ctx->pictures_per_frame;
         src = (const uint16_t *)(ctx->pic->data[i] + yp * linesize[i] +
                                  line_add * ctx->pic->linesize[i]) + xp;
+        blocks[i] = ctx->blocks[i] + (y * ctx->slices_width * ctx->mbs_per_slice + x) * 16 * 16;
 
         if (i < 3) {
             get_slice_data(ctx, src, linesize[i], xp, yp,
                            pwidth, avctx->height / ctx->pictures_per_frame,
-                           td->blocks[i], td->emu_buf,
+                           blocks[i], td->emu_buf,
                            mbs_per_slice, num_cblocks[i], is_chroma[i]);
         } else {
             get_alpha_data(ctx, src, linesize[i], xp, yp,
                            pwidth, avctx->height / ctx->pictures_per_frame,
-                           td->blocks[i], mbs_per_slice, ctx->alpha_bits);
+                           blocks[i], mbs_per_slice, ctx->alpha_bits);
         }
     }
 
@@ -868,7 +881,7 @@ static int find_slice_quant(AVCodecContext *avctx,
 
     if (ctx->alpha_bits)
         alpha_bits = estimate_alpha_plane(ctx, src, linesize[3],
-                                          mbs_per_slice, td->blocks[3]);
+                                          mbs_per_slice, blocks[3]);
     // todo: maybe perform coarser quantising to fit into frame size when needed
     for (q = min_quant; q <= max_quant; q++) {
         bits  = alpha_bits;
@@ -877,13 +890,13 @@ static int find_slice_quant(AVCodecContext *avctx,
                                      src, linesize[0],
                                      mbs_per_slice,
                                      num_cblocks[0], plane_factor[0],
-                                     ctx->quants[q], td); /* estimate luma plane */
+                                     ctx->quants[q], blocks[0]); /* estimate luma plane */
         for (i = 1; i < ctx->num_planes - !!ctx->alpha_bits; i++) { /* estimate chroma plane */
             bits += estimate_slice_plane(ctx, &error, i,
                                          src, linesize[i],
                                          mbs_per_slice,
                                          num_cblocks[i], plane_factor[i],
-                                         ctx->quants_chroma[q], td);
+                                         ctx->quants_chroma[q], blocks[i]);
         }
         if (bits > 65000 * 8)
             error = SCORE_LIMIT;
@@ -914,13 +927,13 @@ static int find_slice_quant(AVCodecContext *avctx,
                                          src, linesize[0],
                                          mbs_per_slice,
                                          num_cblocks[0], plane_factor[0],
-                                         qmat, td);/* estimate luma plane */
+                                         qmat, blocks[0]);/* estimate luma plane */
             for (i = 1; i < ctx->num_planes - !!ctx->alpha_bits; i++) { /* estimate chroma plane */
                 bits += estimate_slice_plane(ctx, &error, i,
                                              src, linesize[i],
                                              mbs_per_slice,
                                              num_cblocks[i], plane_factor[i],
-                                             qmat_chroma, td);
+                                             qmat_chroma, blocks[i]);
             }
             if (bits <= ctx->bits_per_mb * mbs_per_slice)
                 break;
@@ -1167,6 +1180,10 @@ static av_cold int encode_close(AVCodecContext *avctx)
     av_freep(&ctx->tdata);
     av_freep(&ctx->slice_q);
 
+    for (i = 0; i < MAX_PLANES; i++) {
+        av_freep(&ctx->blocks[i]);
+    }
+
     return 0;
 }
 
@@ -1319,6 +1336,19 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 ctx->tdata[j].nodes[i].score     = 0;
             }
         }
+
+        for (j = 0; j < MAX_PLANES; j++) {
+            ctx->blocks[j] = av_malloc(16 * 16
+                                       * ctx->slices_width
+                                       * ctx->mb_height
+                                       * ctx->mbs_per_slice
+                                       * sizeof(*ctx->blocks[0]));
+
+            if (!ctx->blocks[j]) {
+                encode_close(avctx);
+                return AVERROR(ENOMEM);
+            }
+        }
     } else {
         int ls = 0;
         int ls_chroma = 0;
-- 
2.19.0