[FFmpeg-devel] [PATCH 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance
Dennis Mungai
dmngaie at gmail.com
Sat Dec 25 07:49:14 EET 2021
On Sat, 25 Dec 2021, 02:23 Ed Martin, <lists at edman007.com> wrote:
> On 10/31/21 22:14, Chen, Wenbin wrote:
> >> Add async_depth to increase the encoder's performance. Reuse encode_fifo as
> >> the async buffer. The encoder submits all reordered frames to the HW and
> >> then checks the fifo size. If the fifo size < async_depth and the top frame
> >> is not ready, it returns AVERROR(EAGAIN) to request more frames.
> >>
> >> 1080p transcoding (no B-frames) with -async_depth=4 improves performance
> >> by 20% in my environment.
> >> Asynchronous operation increases performance but also introduces frame delay.
> >>
> >> Signed-off-by: Wenbin Chen <wenbin.chen at intel.com>
> >> ---
> >> libavcodec/vaapi_encode.c | 20 +++++++++++++++-----
> >> libavcodec/vaapi_encode.h | 12 ++++++++++--
> >> 2 files changed, 25 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> >> index db0ae136a1..616fb7c089 100644
> >> --- a/libavcodec/vaapi_encode.c
> >> +++ b/libavcodec/vaapi_encode.c
> >> @@ -1158,7 +1158,8 @@ static int
> >> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
> >> if (ctx->input_order == ctx->decode_delay)
> >> ctx->dts_pts_diff = pic->pts - ctx->first_pts;
> >> if (ctx->output_delay > 0)
> >> - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] =
> pic->pts;
> >> + ctx->ts_ring[ctx->input_order %
> >> + (3 * ctx->output_delay + ctx->async_depth)] =
> pic->pts;
> >>
> >> pic->display_order = ctx->input_order;
> >> ++ctx->input_order;
> >> @@ -1212,7 +1213,8 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >> return AVERROR(EAGAIN);
> >> }
> >>
> >> - while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES *
> >> sizeof(VAAPIEncodePicture *)) {
> >> + while (av_fifo_size(ctx->encode_fifo) <
> >> + MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) {
> >> pic = NULL;
> >> err = vaapi_encode_pick_next(avctx, &pic);
> >> if (err < 0)
> >> @@ -1234,6 +1236,14 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >> if (!av_fifo_size(ctx->encode_fifo))
> >> return err;
> >>
> >> + if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth *
> >> sizeof(VAAPIEncodePicture *) &&
> >> + !ctx->end_of_stream) {
> >> + av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic),
> NULL);
> >> + err = vaapi_encode_wait(avctx, pic, 0);
> >> + if (err < 0)
> >> + return err;
> >> + }
> >> +
> >> av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
> >> ctx->encode_order = pic->encode_order + 1;
> >>
> >> @@ -1252,7 +1262,7 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >> pkt->dts = ctx->ts_ring[pic->encode_order] -
> ctx->dts_pts_diff;
> >> } else {
> >> pkt->dts = ctx->ts_ring[(pic->encode_order -
> ctx->decode_delay) %
> >> - (3 * ctx->output_delay)];
> >> + (3 * ctx->output_delay +
> ctx->async_depth)];
> >> }
> >> av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64"
> >> dts %"PRId64".\n",
> >> pkt->pts, pkt->dts);
> >> @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext
> >> *avctx)
> >> }
> >> }
> >>
> >> - ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
> >> - sizeof(VAAPIEncodePicture *));
> >> + ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH *
> >> + sizeof(VAAPIEncodePicture *));
> >> if (!ctx->encode_fifo)
> >> return AVERROR(ENOMEM);
> >>
> >> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> >> index 89fe8de466..1bf5d7c337 100644
> >> --- a/libavcodec/vaapi_encode.h
> >> +++ b/libavcodec/vaapi_encode.h
> >> @@ -48,6 +48,7 @@ enum {
> >> MAX_TILE_ROWS = 22,
> >> // A.4.1: table A.6 allows at most 20 tile columns for any level.
> >> MAX_TILE_COLS = 20,
> >> + MAX_ASYNC_DEPTH = 64,
> >> };
> >>
> >> extern const AVCodecHWConfigInternal *const
> >> ff_vaapi_encode_hw_configs[];
> >> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
> >> // Timestamp handling.
> >> int64_t first_pts;
> >> int64_t dts_pts_diff;
> >> - int64_t ts_ring[MAX_REORDER_DELAY * 3];
> >> + int64_t ts_ring[MAX_REORDER_DELAY * 3 +
> >> + MAX_ASYNC_DEPTH];
> >>
> >> // Slice structure.
> >> int slice_block_rows;
> >> @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext {
> >> AVFrame *frame;
> >>
> >> AVFifoBuffer *encode_fifo;
> >> +
> >> + int async_depth;
> >> } VAAPIEncodeContext;
> >>
> >> enum {
> >> @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
> >> { "b_depth", \
> >> "Maximum B-frame reference depth", \
> >> OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
> >> - { .i64 = 1 }, 1, INT_MAX, FLAGS }
> >> + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
> >> + { "async_depth", "Maximum processing parallelism. " \
> >> + "Increase this to improve single channel performance", \
> >> + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
> >> + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
> >>
> >> #define VAAPI_ENCODE_RC_MODE(name, desc) \
> >> { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name },
> \
> >> --
> >> 2.25.1
> > ping
>
> I tested this patchset and can confirm that it fixes a bug that I had
> thought was a Mesa bug
> (https://gitlab.freedesktop.org/mesa/mesa/-/issues/1235)
>
>
> I would love it if this feature were incorporated into FFmpeg.
>
>
> Indeed, this is the only patch that makes AMD GPUs usable with VAAPI.
More information about the ffmpeg-devel mailing list