[FFmpeg-devel] [PATCH] simplify dnxhd threading

Fri Sep 25 18:30:42 CEST 2009

Hello,
the attached patch changes the dnxhd encoder to create one task per slice
instead of one per thread.
Since the overhead of having more tasks (the maximum is 64 or so anyway) is
minimal, this causes no slowdown, but it allows removing the outer loops
of several functions and further simplifications (and it also gets rid of the
error when -threads is greater than the number of slices).
It also means that e.g. on a system with two very differently loaded
CPUs and using only two threads, each thread will dynamically process as
much as it can, instead of one thread being idle while the other one has
barely started processing.
I have tested that it is not slower than the original code, but that test
was probably useless, since with the options I used, encoding did not get
any faster when using threads...
-------------- next part --------------
Index: libavcodec/dnxhdenc.c
===================================================================

--- libavcodec/dnxhdenc.c	(revision 20013)
+++ libavcodec/dnxhdenc.c	(working copy)
@@ -211,20 +211,15 @@
     ctx->frame.pict_type = FF_I_TYPE;
     ctx->m.avctx->coded_frame = &ctx->frame;
 
-    if (avctx->thread_count > MAX_THREADS || (avctx->thread_count > ctx->m.mb_height)) {
-        av_log(avctx, AV_LOG_ERROR, "too many threads\n");
-        return -1;
-    }
-
     ctx->thread[0] = ctx;
-    for (i = 1; i < avctx->thread_count; i++) {
+    for (i = 1; i < ctx->m.mb_height; i++) {
         ctx->thread[i] =  av_malloc(sizeof(DNXHDEncContext));
         memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext));
     }
 
-    for (i = 0; i < avctx->thread_count; i++) {
-        ctx->thread[i]->m.start_mb_y = (ctx->m.mb_height*(i  ) + avctx->thread_count/2) / avctx->thread_count;
-        ctx->thread[i]->m.end_mb_y   = (ctx->m.mb_height*(i+1) + avctx->thread_count/2) / avctx->thread_count;
+    for (i = 0; i < ctx->m.mb_height; i++) {
+        ctx->thread[i]->m.start_mb_y = i;
+        ctx->thread[i]->m.end_mb_y   = i+1;
     }
 
     return 0;
@@ -403,7 +398,7 @@
     int mb_y, mb_x;
     int qscale = ctx->thread[0]->qscale;
 
-    for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) {
+    mb_y = ctx->m.start_mb_y;
         ctx->m.last_dc[0] =
         ctx->m.last_dc[1] =
         ctx->m.last_dc[2] = 1024;
@@ -443,7 +438,6 @@
             ctx->mb_rc[qscale][mb].ssd = ssd;
             ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0];
         }
-    }
     return 0;
 }
 
@@ -452,7 +446,7 @@
     DNXHDEncContext *ctx = *(void**)arg;
     int mb_y, mb_x;
 
-    for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) {
+    mb_y = ctx->m.start_mb_y;
         ctx->m.last_dc[0] =
         ctx->m.last_dc[1] =
         ctx->m.last_dc[2] = 1024;
@@ -477,7 +471,6 @@
         }
         if (put_bits_count(&ctx->m.pb)&31)
             put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0);
-    }
     flush_put_bits(&ctx->m.pb);
     return 0;
 }
@@ -486,9 +479,9 @@
 {
     int mb_y, mb_x;
     int i, offset = 0;
-    for (i = 0; i < ctx->m.avctx->thread_count; i++) {
-        int thread_size = 0;
-        for (mb_y = ctx->thread[i]->m.start_mb_y; mb_y < ctx->thread[i]->m.end_mb_y; mb_y++) {
+    for (i = 0; i < ctx->m.mb_height; i++) {
+        int thread_size;
+        mb_y = ctx->thread[i]->m.start_mb_y;
             ctx->slice_size[mb_y] = 0;
             for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
                 unsigned mb = mb_y * ctx->m.mb_width + mb_x;
@@ -496,8 +489,7 @@
             }
             ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31;
             ctx->slice_size[mb_y] >>= 3;
-            thread_size += ctx->slice_size[mb_y];
-        }
+            thread_size = ctx->slice_size[mb_y];
         init_put_bits(&ctx->thread[i]->m.pb, buf + 640 + offset, thread_size);
         offset += thread_size;
     }
@@ -507,7 +499,7 @@
 {
     DNXHDEncContext *ctx = *(void**)arg;
     int mb_y, mb_x;
-    for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) {
+    mb_y = ctx->m.start_mb_y;
         for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
             unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
             uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4);
@@ -516,7 +508,6 @@
             ctx->mb_cmp[mb].value = varc;
             ctx->mb_cmp[mb].mb = mb;
         }
-    }
     return 0;
 }
 
@@ -528,7 +519,7 @@
 
     for (q = 1; q < avctx->qmax; q++) {
         ctx->qscale = q;
-        avctx->execute(avctx, dnxhd_calc_bits_thread, (void**)&ctx->thread[0], NULL, avctx->thread_count, sizeof(void*));
+        avctx->execute(avctx, dnxhd_calc_bits_thread, (void**)&ctx->thread[0], NULL, ctx->m.mb_height, sizeof(void*));
     }
     up_step = down_step = 2<<LAMBDA_FRAC_BITS;
     lambda = ctx->lambda;
@@ -608,7 +599,7 @@
         bits = 0;
         ctx->qscale = qscale;
         // XXX avoid recalculating bits
-        ctx->m.avctx->execute(ctx->m.avctx, dnxhd_calc_bits_thread, (void**)&ctx->thread[0], NULL, ctx->m.avctx->thread_count, sizeof(void*));
+        ctx->m.avctx->execute(ctx->m.avctx, dnxhd_calc_bits_thread, (void**)&ctx->thread[0], NULL, ctx->m.mb_height, sizeof(void*));
         for (y = 0; y < ctx->m.mb_height; y++) {
             for (x = 0; x < ctx->m.mb_width; x++)
                 bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits;
@@ -732,7 +723,7 @@
     }
     if (!ret) {
         if (RC_VARIANCE)
-            avctx->execute(avctx, dnxhd_mb_var_thread, (void**)&ctx->thread[0], NULL, avctx->thread_count, sizeof(void*));
+            avctx->execute(avctx, dnxhd_mb_var_thread, (void**)&ctx->thread[0], NULL, ctx->m.mb_height, sizeof(void*));
         radix_sort(ctx->mb_cmp, ctx->m.mb_num);
         for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) {
             int mb = ctx->mb_cmp[x].mb;
@@ -753,7 +744,7 @@
         ctx->frame.linesize[i] = frame->linesize[i];
     }
 
-    for (i = 0; i < ctx->m.avctx->thread_count; i++) {
+    for (i = 0; i < ctx->m.mb_height; i++) {
         ctx->thread[i]->m.linesize    = ctx->frame.linesize[0]<<ctx->interlaced;
         ctx->thread[i]->m.uvlinesize  = ctx->frame.linesize[1]<<ctx->interlaced;
         ctx->thread[i]->dct_y_offset  = ctx->m.linesize  *8;
@@ -804,7 +795,7 @@
         assert(!(ctx->slice_size[i] & 3));
     }
 
-    avctx->execute(avctx, dnxhd_encode_thread, (void**)&ctx->thread[0], NULL, avctx->thread_count, sizeof(void*));
+    avctx->execute(avctx, dnxhd_encode_thread, (void**)&ctx->thread[0], NULL, ctx->m.mb_height, sizeof(void*));
 
     assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
     memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640);
@@ -846,7 +837,7 @@
     av_freep(&ctx->qmatrix_c16);
     av_freep(&ctx->qmatrix_l16);
 
-    for (i = 1; i < avctx->thread_count; i++)
+    for (i = 1; i < ctx->m.mb_height; i++)
         av_freep(&ctx->thread[i]);
 
     return 0;
Index: libavcodec/dnxhdenc.h
===================================================================
--- libavcodec/dnxhdenc.h	(revision 20013)
+++ libavcodec/dnxhdenc.h	(working copy)
@@ -38,6 +38,8 @@
     int bits;
 } RCEntry;
 
+#define MAX_SLICES (1088 / 16)
+
 typedef struct DNXHDEncContext {
     MpegEncContext m; ///< Used for quantization dsp functions
 
@@ -47,7 +49,7 @@
     uint8_t *msip; ///< Macroblock Scan Indexes Payload
     uint32_t *slice_size;
 
-    struct DNXHDEncContext *thread[MAX_THREADS];
+    struct DNXHDEncContext *thread[MAX_SLICES];
 
     unsigned dct_y_offset;
     unsigned dct_uv_offset;