[FFmpeg-devel] [PATCH] avcodec/amfenc: DX12 Reference-only feature support

Fri Jan 31 03:22:00 EET 2025

On 2025-01-30 9:09 a.m., Araz Iusubov wrote:
> The Reference-Only feature in DirectX 12 is a memory optimization
> technique designed for video decoding scenarios.
> This feature requires that reference resources must be allocated with
> the D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY resource flag.
> Reference textures must also be separated from output textures.
> This feature is not supported in the current version of ffmpeg.
> Since AMD GPUs use this feature in the Direct3D 12 decoder,
> ffmpeg does not support AMD GPU Direct3D 12 decoding.
> To properly support the Reference-Only feature,
> two parallel resource pools must be configured and managed:
> General Resource Pool:
> Contains resources used for output decoded frames.
> Defined in AVHWFramesContext and manages the final decoded textures.
> Reference-Only Resource Pool:
> Intended for storing reference frame resources.
> Resources created with the
> D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY flag
> are allocated to AVBufferPool.
> 
> ---
>   libavcodec/d3d12va_decode.c   | 58 ++++++++++++++++++++++++++++---
>   libavutil/hwcontext_d3d12va.c | 65 ++++++++++++++++++++++++++++++++---
>   2 files changed, 115 insertions(+), 8 deletions(-)

This patch only affects d3d12va; why is the commit message prefixed with amfenc?

> 
> diff --git a/libavcodec/d3d12va_decode.c b/libavcodec/d3d12va_decode.c
> index 3b8978635e..8916f94d10 100644
> --- a/libavcodec/d3d12va_decode.c
> +++ b/libavcodec/d3d12va_decode.c
> @@ -51,11 +51,19 @@ unsigned ff_d3d12va_get_surface_index(const AVCodecContext *avctx,
>                                         D3D12VADecodeContext *ctx, const AVFrame *frame,
>                                         int curr)
>   {
> +    AVHWFramesContext      *frames_ctx   = D3D12VA_FRAMES_CONTEXT(avctx);
> +    AVD3D12VAFramesContext *frames_hwctx = frames_ctx->hwctx;
> +
>       AVD3D12VAFrame *f;
>       ID3D12Resource *res;
>       unsigned i;
>   
> -    f = (AVD3D12VAFrame *)frame->data[0];
> +    if (frames_hwctx->flags & D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY) {
> +        f = (AVD3D12VAFrame*)frame->data[1];
> +    } else {
> +        f = (AVD3D12VAFrame*)frame->data[0];
> +    }
> +
>       if (!f)
>           goto fail;
>   
> @@ -250,6 +258,11 @@ static int d3d12va_create_decoder(AVCodecContext *avctx)
>           return AVERROR_PATCHWELCOME;
>       }
>   
> +    if (feature.ConfigurationFlags & D3D12_VIDEO_DECODE_CONFIGURATION_FLAG_REFERENCE_ONLY_ALLOCATIONS_REQUIRED) {
> +        frames_hwctx->flags |= (D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY | D3D12_RESOURCE_FLAG_DENY_SHADER_RESOURCE);
> +        av_log(avctx, AV_LOG_INFO, "Reference-Only Allocations are required for this configuration.\n");
> +    }
> +
>       desc = (D3D12_VIDEO_DECODER_DESC) {
>           .NodeMask = 0,
>           .Configuration = ctx->cfg,
> @@ -440,8 +453,19 @@ int ff_d3d12va_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
>       D3D12VADecodeContext   *ctx               = D3D12VA_DECODE_CONTEXT(avctx);
>       ID3D12Resource         *buffer            = NULL;
>       ID3D12CommandAllocator *command_allocator = NULL;
> -    AVD3D12VAFrame         *f                 = (AVD3D12VAFrame *)frame->data[0];
> -    ID3D12Resource         *resource          = (ID3D12Resource *)f->texture;
> +    AVHWFramesContext      *frames_ctx        = D3D12VA_FRAMES_CONTEXT(avctx);
> +    AVD3D12VAFramesContext *frames_hwctx      = frames_ctx->hwctx;
> +    AVD3D12VAFrame         *f                 = NULL;
> +    AVD3D12VAFrame         *output_data       = NULL;
> +
> +    if (frames_hwctx->flags & D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY) {
> +        f           = (AVD3D12VAFrame*)frame->data[1];
> +        output_data = (AVD3D12VAFrame*)frame->data[0];
> +    } else {
> +        f           = (AVD3D12VAFrame*)frame->data[0];
> +    }
> +
> +    ID3D12Resource* resource = (ID3D12Resource*)f->texture;
>   
>       ID3D12VideoDecodeCommandList *cmd_list = ctx->command_list;
>       D3D12_RESOURCE_BARRIER barriers[32] = { 0 };
> @@ -469,6 +493,14 @@ int ff_d3d12va_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
>           .pOutputTexture2D    = resource,
>       };
>   
> +    if (frames_hwctx->flags & D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY) {
> +        output_args.pOutputTexture2D = output_data->texture;
> +
> +        output_args.ConversionArguments.Enable               = 1;
> +        output_args.ConversionArguments.pReferenceTexture2D  = resource;
> +        output_args.ConversionArguments.ReferenceSubresource = 0;
> +    }
> +
>       UINT num_barrier = 1;
>       barriers[0] = (D3D12_RESOURCE_BARRIER) {
>           .Type  = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
> @@ -481,6 +513,20 @@ int ff_d3d12va_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
>           },
>       };
>   
> +    if (frames_hwctx->flags & D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY) {
> +        barriers[1] = (D3D12_RESOURCE_BARRIER) {
> +            .Type  = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
> +            .Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE,
> +            .Transition = {
> +                .pResource   = output_data->texture,
> +                .Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES,
> +                .StateBefore = D3D12_RESOURCE_STATE_COMMON,
> +                .StateAfter  = D3D12_RESOURCE_STATE_VIDEO_DECODE_WRITE,
> +            },
> +        };
> +        num_barrier++;
> +    }
> +
>       memset(ctx->ref_subresources, 0, sizeof(UINT) * ctx->max_num_ref);
>       input_args.ReferenceFrames.NumTexture2Ds = ctx->max_num_ref;
>       input_args.ReferenceFrames.ppTexture2Ds  = ctx->ref_resources;
> @@ -505,7 +551,7 @@ int ff_d3d12va_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
>   
>       DX_CHECK(ID3D12VideoDecodeCommandList_Reset(cmd_list, command_allocator));
>   
> -    num_barrier += d3d12va_update_reference_frames_state(avctx, &barriers[1], resource, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_VIDEO_DECODE_READ);
> +    num_barrier += d3d12va_update_reference_frames_state(avctx, &barriers[num_barrier], resource, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_VIDEO_DECODE_READ);
>   

You could optimize these barriers since reference-only resources don't 
need to be transitioned to COMMON, and can remain in 
VIDEO_DECODE_{READ,WRITE}.

I propose the following:
- Transition all reference textures to VIDEO_DECODE_READ at creation time.
- When preparing resources for input to DecodeFrame(), transition only 
the texture for reference output to VIDEO_DECODE_WRITE.
- After DecodeFrame(), transition the reference output texture to 
VIDEO_DECODE_READ. This is already implicitly handled by the barrier SWAP.

All in all, for the cost of an initial transition at creation time, you 
decrease the number of barriers in each frame to just 2.

>       ID3D12VideoDecodeCommandList_ResourceBarrier(cmd_list, num_barrier, barriers);
>   
> @@ -522,6 +568,10 @@ int ff_d3d12va_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
>   
>       DX_CHECK(ID3D12CommandQueue_Signal(ctx->command_queue, f->sync_ctx.fence, ++f->sync_ctx.fence_value));
>   
> +    if (frames_hwctx->flags & D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY) {
> +        DX_CHECK(ID3D12CommandQueue_Signal(ctx->command_queue, output_data->sync_ctx.fence, ++output_data->sync_ctx.fence_value));
> +    }
> +
>       DX_CHECK(ID3D12CommandQueue_Signal(ctx->command_queue, ctx->sync_ctx.fence, ++ctx->sync_ctx.fence_value));
>   
>       ret = d3d12va_discard_helper_objects(avctx, command_allocator, buffer, ctx->sync_ctx.fence_value);
> diff --git a/libavutil/hwcontext_d3d12va.c b/libavutil/hwcontext_d3d12va.c
> index 6507cf69c1..328827b040 100644
> --- a/libavutil/hwcontext_d3d12va.c
> +++ b/libavutil/hwcontext_d3d12va.c
> @@ -49,6 +49,24 @@ typedef struct D3D12VAFramesContext {
>       ID3D12GraphicsCommandList *command_list;
>       AVD3D12VASyncContext       sync_ctx;
>       UINT                       luma_component_size;
> +
> +    /**
> +     * The Reference-Only feature in DirectX 12 is a memory optimization
> +     * technique designed for video decoding/encoding scenarios.
> +     * This feature requires that reference resources must be allocated
> +     * with the `D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY` resource flag.
> +     * Reference textures must also be separated from output textures.
> +     * To correctly support the Reference-Only feature, two parallel resource
> +     * pools must be configured and managed:
> +     * 1. General Resource Pool:
> +     *   - Contains resources used for outputting decoded frames.
> +     *   - Defined in `AVHWFramesContext` and manages the final decoded textures.
> +     * 2. Reference-Only Resource Pool:
> +     *   - Dedicated to storing reference frame resources.
> +     *   - Resources created with the `D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY`
> +     *     flag are allocated to this pool.
> +     */
> +    AVBufferPool              *pool_reference_only;
>   } D3D12VAFramesContext;
>   
>   typedef struct D3D12VADevicePriv {
> @@ -174,7 +192,8 @@ fail:
>   
>   static void d3d12va_frames_uninit(AVHWFramesContext *ctx)
>   {
> -    D3D12VAFramesContext *s = ctx->hwctx;
> +    D3D12VAFramesContext   *s            = ctx->hwctx;
> +    AVD3D12VAFramesContext *frames_hwctx = &s->p;
>   
>       D3D12_OBJECT_RELEASE(s->sync_ctx.fence);
>       if (s->sync_ctx.event)
> @@ -185,6 +204,11 @@ static void d3d12va_frames_uninit(AVHWFramesContext *ctx)
>       D3D12_OBJECT_RELEASE(s->command_allocator);
>       D3D12_OBJECT_RELEASE(s->command_list);
>       D3D12_OBJECT_RELEASE(s->command_queue);
> +
> +    if (frames_hwctx->flags & D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY) {
> +        if (s->pool_reference_only)
> +            av_buffer_pool_uninit(&s->pool_reference_only);
> +    }
>   }
>   
>   static int d3d12va_frames_get_constraints(AVHWDeviceContext *ctx, const void *hwconfig, AVHWFramesConstraints *constraints)
> @@ -281,6 +305,7 @@ fail:
>   static int d3d12va_frames_init(AVHWFramesContext *ctx)
>   {
>       AVD3D12VAFramesContext *hwctx = ctx->hwctx;
> +    D3D12VAFramesContext   *s     = ctx->hwctx;
>       int i;
>   
>       for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) {
> @@ -304,16 +329,43 @@ static int d3d12va_frames_init(AVHWFramesContext *ctx)
>       if (!ffhwframesctx(ctx)->pool_internal)
>           return AVERROR(ENOMEM);
>   
> +    s->pool_reference_only = NULL;
> +
>       return 0;
>   }
>   
>   static int d3d12va_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
>   {
>       int ret;
> +    D3D12VAFramesContext   *s            = ctx->hwctx;
> +    AVD3D12VAFramesContext *frames_hwctx = &s->p;
>   
> -    frame->buf[0] = av_buffer_pool_get(ctx->pool);
> -    if (!frame->buf[0])
> -        return AVERROR(ENOMEM);
> +    if (frames_hwctx->flags & D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY) {
> +        /*
> +         * for the output texture, temporarily unset D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY
> +         * and D3D12_RESOURCE_FLAG_DENY_SHADER_RESOURCE
> +        */
> +        D3D12_RESOURCE_FLAGS temp_flags = frames_hwctx->flags;
> +        frames_hwctx->flags &= ~(D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY | D3D12_RESOURCE_FLAG_DENY_SHADER_RESOURCE);
> +
> +        frame->buf[0] = av_buffer_pool_get(ctx->pool);
> +        if (!frame->buf[0])
> +            return AVERROR(ENOMEM);
> +
> +        if (s->pool_reference_only == NULL) {
> +            s->pool_reference_only = av_buffer_pool_init2(sizeof(AVD3D12VAFrame),
> +                ctx, d3d12va_pool_alloc, NULL);
> +        }
> +
> +        frames_hwctx->flags = temp_flags;
> +        frame->buf[1] = av_buffer_pool_get(s->pool_reference_only);
> +        if (!frame->buf[1])
> +            return AVERROR(ENOMEM);
> +    } else {
> +        frame->buf[0] = av_buffer_pool_get(ctx->pool);
> +        if (!frame->buf[0])
> +            return AVERROR(ENOMEM);
> +    }
>   
>       ret = av_image_fill_arrays(frame->data, frame->linesize, NULL,
>                                  ctx->sw_format, ctx->width, ctx->height,
> @@ -322,6 +374,11 @@ static int d3d12va_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
>           return ret;
>   
>       frame->data[0] = frame->buf[0]->data;
> +
> +    if (frames_hwctx->flags & D3D12_RESOURCE_FLAG_VIDEO_DECODE_REFERENCE_ONLY) {
> +        frame->data[1] = frame->buf[1]->data;
> +    }
> +
>       frame->format  = AV_PIX_FMT_D3D12;
>       frame->width   = ctx->width;
>       frame->height  = ctx->height;