[FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support
lance.lmwang at gmail.com
lance.lmwang at gmail.com
Fri Sep 6 18:28:29 EEST 2019
From: Limin Wang <lance.lmwang at gmail.com>
Multithreading avoids saturating a single CPU core when other filters such as scale are also in use.
The performance gain is very small; my test results are shown
below.
To avoid a disk bottleneck, I use -stream_loop on an input file that contains
only 10 frames.
./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
~/Movies/1.v210
master:
./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
-f null -
frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
overhead: unknown
bench: utime=10.082s stime=13.784s rtime=23.889s
bench: maxrss=147836928kB
patch applied:
./ffmpeg -threads 4 -thread_type frame+slice -s 4096x3072 -stream_loop 100 -i
~/Movies/1.v210 -benchmark -f null -
frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
overhead: unknown
bench: utime=11.407s stime=17.258s rtime=18.279s
bench: maxrss=442884096kB
Signed-off-by: Limin Wang <lance.lmwang at gmail.com>
---
libavcodec/v210dec.c | 135 +++++++++++++++++++++++++++++++++------------------
libavcodec/v210dec.h | 1 +
2 files changed, 88 insertions(+), 48 deletions(-)
diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index 6ce18aa..2cdb99e 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -28,6 +28,7 @@
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/intreadwrite.h"
+#include "thread.h"
#define READ_PIXELS(a, b, c) \
do { \
@@ -37,6 +38,13 @@
*c++ = (val >> 20) & 0x3FF; \
} while (0)
+#define MAX_SLICES 32
+typedef struct ThreadData {
+ AVFrame *frame;
+ uint8_t *buf;
+ int stride;
+} ThreadData;
+
static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
{
uint32_t val;
@@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
s->aligned_input = 0;
ff_v210dec_init(s);
+ s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
return 0;
}
-static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
- AVPacket *avpkt)
+static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int nb_jobs)
{
V210DecContext *s = avctx->priv_data;
-
- int h, w, ret, stride, aligned_input;
- AVFrame *pic = data;
- const uint8_t *psrc = avpkt->data;
+ int h, w;
+ ThreadData *td = arg;
+ AVFrame *frame = td->frame;
+ int stride = td->stride;
+ int slice_h = avctx->height / s->slice_count;
+ int slice_m = avctx->height % s->slice_count;
+ int slice_start = jobnr * slice_h;
+ int slice_end = slice_start + slice_h;
+ const uint8_t *psrc = td->buf + stride * slice_start;
uint16_t *y, *u, *v;
- if (s->custom_stride )
- stride = s->custom_stride;
- else {
- int aligned_width = ((avctx->width + 47) / 48) * 48;
- stride = aligned_width * 8 / 3;
- }
-
- if (avpkt->size < stride * avctx->height) {
- if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
- stride = avpkt->size / avctx->height;
- if (!s->stride_warning_shown)
- av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
- s->stride_warning_shown = 1;
- } else {
- av_log(avctx, AV_LOG_ERROR, "packet too small\n");
- return AVERROR_INVALIDDATA;
- }
- }
- if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
- && AV_RN32(psrc) == AV_RN32("INFO")
- && avpkt->size - 64 >= stride * avctx->height)
- psrc += 64;
-
- aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
- if (aligned_input != s->aligned_input) {
- s->aligned_input = aligned_input;
- ff_v210dec_init(s);
- }
-
- if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
- return ret;
-
- y = (uint16_t*)pic->data[0];
- u = (uint16_t*)pic->data[1];
- v = (uint16_t*)pic->data[2];
- pic->pict_type = AV_PICTURE_TYPE_I;
- pic->key_frame = 1;
+ /* add the remaining slice for the last job */
+ if (jobnr == s->slice_count - 1)
+ slice_end += slice_m;
- for (h = 0; h < avctx->height; h++) {
+ y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
+ u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
+ v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2;
+ for (h = slice_start; h < slice_end; h++) {
const uint32_t *src = (const uint32_t*)psrc;
uint32_t val;
@@ -154,10 +136,63 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
}
psrc += stride;
- y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
- u += pic->linesize[1] / 2 - avctx->width / 2;
- v += pic->linesize[2] / 2 - avctx->width / 2;
+ y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1);
+ u += frame->linesize[1] / 2 - avctx->width / 2;
+ v += frame->linesize[2] / 2 - avctx->width / 2;
+ }
+
+ return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+ AVPacket *avpkt)
+{
+ V210DecContext *s = avctx->priv_data;
+ ThreadData td;
+ int ret, stride, aligned_input;
+ ThreadFrame frame = { .f = data };
+ AVFrame *pic = data;
+ const uint8_t *psrc = avpkt->data;
+
+ if (s->custom_stride )
+ stride = s->custom_stride;
+ else {
+ int aligned_width = ((avctx->width + 47) / 48) * 48;
+ stride = aligned_width * 8 / 3;
+ }
+ td.stride = stride;
+
+ if (avpkt->size < stride * avctx->height) {
+ if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
+ stride = avpkt->size / avctx->height;
+ if (!s->stride_warning_shown)
+ av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
+ s->stride_warning_shown = 1;
+ } else {
+ av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+ return AVERROR_INVALIDDATA;
+ }
}
+ if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
+ && AV_RN32(psrc) == AV_RN32("INFO")
+ && avpkt->size - 64 >= stride * avctx->height)
+ psrc += 64;
+
+ aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
+ if (aligned_input != s->aligned_input) {
+ s->aligned_input = aligned_input;
+ ff_v210dec_init(s);
+ }
+
+ if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+ return ret;
+
+ pic->pict_type = AV_PICTURE_TYPE_I;
+ pic->key_frame = 1;
+
+ td.buf = (uint8_t*)psrc;
+ td.frame = pic;
+ avctx->execute2(avctx, v210_decode_slice, &td, NULL, s->slice_count);
if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
/* we have interlaced material flagged in container */
@@ -193,6 +228,10 @@ AVCodec ff_v210_decoder = {
.priv_data_size = sizeof(V210DecContext),
.init = decode_init,
.decode = decode_frame,
- .capabilities = AV_CODEC_CAP_DR1,
+ .capabilities = AV_CODEC_CAP_DR1 |
+ AV_CODEC_CAP_SLICE_THREADS |
+ AV_CODEC_CAP_FRAME_THREADS,
.priv_class = &v210dec_class,
+ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE |
+ FF_CODEC_CAP_INIT_CLEANUP,
};
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index cfdb29d..3581943 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -26,6 +26,7 @@
typedef struct {
AVClass *av_class;
int custom_stride;
+ int slice_count; // Number of slices for threaded operations
int aligned_input;
int stride_warning_shown;
void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
--
2.6.4
More information about the ffmpeg-devel
mailing list