[FFmpeg-devel] [PATCH] avcodec/d3d12va_encode: texture array support for HEVC

Tue May 6 19:54:59 EEST 2025

This patch adds support for the texture array feature
used by AMD boards in the D3D12 HEVC encoder.
In texture array mode, a single texture array is shared for all
reference and reconstructed pictures using different subresources.
The implementation ensures compatibility
and has been successfully tested on AMD, Intel, and NVIDIA GPUs.

---
 libavcodec/d3d12va_encode.c      | 241 +++++++++++++++++++++++++------
 libavcodec/d3d12va_encode.h      |  29 ++++
 libavcodec/d3d12va_encode_hevc.c |   5 +-
 3 files changed, 231 insertions(+), 44 deletions(-)

diff --git a/libavcodec/d3d12va_encode.c b/libavcodec/d3d12va_encode.c
index 4d738200fe..580d2ea383 100644
--- a/libavcodec/d3d12va_encode.c
+++ b/libavcodec/d3d12va_encode.c
@@ -264,6 +264,11 @@ static int d3d12va_encode_issue(AVCodecContext *avctx,
 
     av_log(avctx, AV_LOG_DEBUG, "Input surface is %p.\n", pic->input_surface->texture);
 
+    if (ctx->is_texture_array) {
+        base_pic->recon_image->data[0] = ctx->texture_array_frame;
+        pic->subresource_index = (ctx->subresource_used_index++) % ctx->max_subresource_array_size;
+    }
+
     pic->recon_surface = (AVD3D12VAFrame *)base_pic->recon_image->data[0];
     av_log(avctx, AV_LOG_DEBUG, "Recon surface is %p.\n",
            pic->recon_surface->texture);
@@ -325,11 +330,28 @@ static int d3d12va_encode_issue(AVCodecContext *avctx,
             goto fail;
         }
 
+        if (ctx->is_texture_array) {
+            d3d12_refs.pSubresources = av_calloc(d3d12_refs.NumTexture2Ds,
+                                                sizeof(*d3d12_refs.pSubresources));
+            if (!d3d12_refs.pSubresources) {
+                err = AVERROR(ENOMEM);
+                goto fail;
+            }
+        }
+
         i = 0;
-        for (j = 0; j < base_pic->nb_refs[0]; j++)
-            d3d12_refs.ppTexture2Ds[i++] = ((D3D12VAEncodePicture *)base_pic->refs[0][j]->priv)->recon_surface->texture;
-        for (j = 0; j < base_pic->nb_refs[1]; j++)
-            d3d12_refs.ppTexture2Ds[i++] = ((D3D12VAEncodePicture *)base_pic->refs[1][j]->priv)->recon_surface->texture;
+        for (j = 0; j < base_pic->nb_refs[0]; j++) {
+            d3d12_refs.ppTexture2Ds[i]  = ((D3D12VAEncodePicture *)base_pic->refs[0][j]->priv)->recon_surface->texture;
+            if (ctx->is_texture_array)
+                d3d12_refs.pSubresources[i] = ((D3D12VAEncodePicture *)base_pic->refs[0][j]->priv)->subresource_index;
+            i++;
+        }
+        for (j = 0; j < base_pic->nb_refs[1]; j++) {
+            d3d12_refs.ppTexture2Ds[i]  = ((D3D12VAEncodePicture *)base_pic->refs[1][j]->priv)->recon_surface->texture;
+            if (ctx->is_texture_array)
+                d3d12_refs.pSubresources[i] = ((D3D12VAEncodePicture *)base_pic->refs[1][j]->priv)->subresource_index;
+            i++;
+        }
     }
 
     input_args.PictureControlDesc.IntraRefreshFrameIndex  = 0;
@@ -343,7 +365,11 @@ static int d3d12va_encode_issue(AVCodecContext *avctx,
     output_args.Bitstream.pBuffer                                    = pic->output_buffer;
     output_args.Bitstream.FrameStartOffset                           = pic->aligned_header_size;
     output_args.ReconstructedPicture.pReconstructedPicture           = pic->recon_surface->texture;
-    output_args.ReconstructedPicture.ReconstructedPictureSubresource = 0;
+    if (ctx->is_texture_array) {
+        output_args.ReconstructedPicture.ReconstructedPictureSubresource = pic->subresource_index;
+    } else {
+        output_args.ReconstructedPicture.ReconstructedPictureSubresource = 0;
+    }
     output_args.EncoderOutputMetadata.pBuffer                        = pic->encoded_metadata;
     output_args.EncoderOutputMetadata.Offset                         = 0;
 
@@ -381,35 +407,87 @@ static int d3d12va_encode_issue(AVCodecContext *avctx,
         },                                                          \
     }
 
+#define TRANSITION_BARRIER_SUBRESOURCE(res, subres,before, after)   \
+    (D3D12_RESOURCE_BARRIER) {                                      \
+        .Type  = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,            \
+        .Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE,                  \
+        .Transition = {                                             \
+            .pResource   = res,                                     \
+            .Subresource = subres,                                  \
+            .StateBefore = before,                                  \
+            .StateAfter  = after,                                   \
+        },                                                          \
+    }
+
     barriers[0] = TRANSITION_BARRIER(pic->input_surface->texture,
                                      D3D12_RESOURCE_STATE_COMMON,
                                      D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);
     barriers[1] = TRANSITION_BARRIER(pic->output_buffer,
                                      D3D12_RESOURCE_STATE_COMMON,
                                      D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE);
-    barriers[2] = TRANSITION_BARRIER(pic->recon_surface->texture,
+    barriers[2] = TRANSITION_BARRIER(pic->encoded_metadata,
                                      D3D12_RESOURCE_STATE_COMMON,
                                      D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE);
-    barriers[3] = TRANSITION_BARRIER(pic->encoded_metadata,
+    barriers[3] = TRANSITION_BARRIER(pic->resolved_metadata,
                                      D3D12_RESOURCE_STATE_COMMON,
                                      D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE);
-    barriers[4] = TRANSITION_BARRIER(pic->resolved_metadata,
-                                     D3D12_RESOURCE_STATE_COMMON,
-                                     D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE);
-
-    ID3D12VideoEncodeCommandList2_ResourceBarrier(cmd_list, 5, barriers);
-
-    if (d3d12_refs.NumTexture2Ds) {
-        D3D12_RESOURCE_BARRIER refs_barriers[3];
-
-        for (i = 0; i < d3d12_refs.NumTexture2Ds; i++)
-            refs_barriers[i] = TRANSITION_BARRIER(d3d12_refs.ppTexture2Ds[i],
-                                                  D3D12_RESOURCE_STATE_COMMON,
-                                                  D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);
-
-        ID3D12VideoEncodeCommandList2_ResourceBarrier(cmd_list, d3d12_refs.NumTexture2Ds,
-                                                      refs_barriers);
+    ID3D12VideoEncodeCommandList2_ResourceBarrier(cmd_list, 4, barriers);
+
+    //set transit barriers for reference pic and recon pic
+    int barriers_ref_index = 0;
+    D3D12_RESOURCE_BARRIER *barriers_ref = NULL;
+    if (ctx->is_texture_array) {
+        barriers_ref = av_calloc(ctx->max_subresource_array_size * ctx->plane_count,
+            sizeof(D3D12_RESOURCE_BARRIER));
+    } else {
+        barriers_ref = av_calloc(MAX_DPB_SIZE,sizeof(D3D12_RESOURCE_BARRIER));
+    }
+
+    if (ctx->is_texture_array) {
+         // In Texture array mode, the D3D12 uses the same texture array for all the input
+         // reference pics in ppTexture2Ds and also for the pReconstructedPicture output allocations,
+         //just different subresources.
+        D3D12_RESOURCE_DESC references_tex_array_desc = { 0 };
+        pic->recon_surface->texture->lpVtbl->GetDesc(pic->recon_surface->texture, &references_tex_array_desc);
+
+        for (uint32_t reference_subresource = 0; reference_subresource < references_tex_array_desc.DepthOrArraySize;
+            reference_subresource++) {
+
+            //D3D12 DecomposeSubresource
+            uint32_t mip_slice, plane_slice, array_slice, array_size;
+            array_size = references_tex_array_desc.DepthOrArraySize;
+            mip_slice = reference_subresource % references_tex_array_desc.MipLevels;
+            array_slice = (reference_subresource / references_tex_array_desc.MipLevels) % array_size;
+
+            for (plane_slice = 0; plane_slice < ctx->plane_count; plane_slice++) {
+                //Calculate the subresource index
+                uint32_t planeOutputSubresource = mip_slice + array_slice * references_tex_array_desc.MipLevels +
+                                        plane_slice * references_tex_array_desc.MipLevels * array_size;
+
+                if (reference_subresource == pic->subresource_index) {
+                    barriers_ref[barriers_ref_index++] = TRANSITION_BARRIER_SUBRESOURCE(pic->recon_surface->texture, planeOutputSubresource,
+                                        D3D12_RESOURCE_STATE_COMMON,
+                                        D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE);
+                } else {
+                    barriers_ref[barriers_ref_index++] = TRANSITION_BARRIER_SUBRESOURCE(pic->recon_surface->texture, planeOutputSubresource,
+                                        D3D12_RESOURCE_STATE_COMMON,
+                                        D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);
+                }
+            }
+        }
+    } else {
+        barriers_ref[barriers_ref_index++] = TRANSITION_BARRIER(pic->recon_surface->texture,
+                                        D3D12_RESOURCE_STATE_COMMON,
+                                        D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE);
+
+        if (d3d12_refs.NumTexture2Ds) {
+            for (i = 0; i < d3d12_refs.NumTexture2Ds; i++)
+                barriers_ref[barriers_ref_index++] = TRANSITION_BARRIER(d3d12_refs.ppTexture2Ds[i],
+                                                    D3D12_RESOURCE_STATE_COMMON,
+                                                    D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);
+        }
     }
+    ID3D12VideoEncodeCommandList2_ResourceBarrier(cmd_list, barriers_ref_index, barriers_ref);
 
     ID3D12VideoEncodeCommandList2_EncodeFrame(cmd_list, ctx->encoder, ctx->encoder_heap,
                                               &input_args, &output_args);
@@ -422,16 +500,15 @@ static int d3d12va_encode_issue(AVCodecContext *avctx,
 
     ID3D12VideoEncodeCommandList2_ResolveEncoderOutputMetadata(cmd_list, &input_metadata, &output_metadata);
 
-    if (d3d12_refs.NumTexture2Ds) {
-        D3D12_RESOURCE_BARRIER refs_barriers[3];
-
-        for (i = 0; i < d3d12_refs.NumTexture2Ds; i++)
-            refs_barriers[i] = TRANSITION_BARRIER(d3d12_refs.ppTexture2Ds[i],
-                                                  D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ,
-                                                  D3D12_RESOURCE_STATE_COMMON);
-
-        ID3D12VideoEncodeCommandList2_ResourceBarrier(cmd_list, d3d12_refs.NumTexture2Ds,
-                                                      refs_barriers);
+    //swap the barriers_ref transition state
+    if (barriers_ref_index > 0) {
+        for (i = 0; i < barriers_ref_index; i++) {
+            D3D12_RESOURCE_STATES temp_statue = barriers_ref[i].Transition.StateBefore;
+            barriers_ref[i].Transition.StateBefore = barriers_ref[i].Transition.StateAfter;
+            barriers_ref[i].Transition.StateAfter = temp_statue;
+        }
+        ID3D12VideoEncodeCommandList2_ResourceBarrier(cmd_list, barriers_ref_index,
+                                                      barriers_ref);
     }
 
     barriers[0] = TRANSITION_BARRIER(pic->input_surface->texture,
@@ -440,17 +517,14 @@ static int d3d12va_encode_issue(AVCodecContext *avctx,
     barriers[1] = TRANSITION_BARRIER(pic->output_buffer,
                                      D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE,
                                      D3D12_RESOURCE_STATE_COMMON);
-    barriers[2] = TRANSITION_BARRIER(pic->recon_surface->texture,
-                                     D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE,
-                                     D3D12_RESOURCE_STATE_COMMON);
-    barriers[3] = TRANSITION_BARRIER(pic->encoded_metadata,
+    barriers[2] = TRANSITION_BARRIER(pic->encoded_metadata,
                                      D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ,
                                      D3D12_RESOURCE_STATE_COMMON);
-    barriers[4] = TRANSITION_BARRIER(pic->resolved_metadata,
+    barriers[3] = TRANSITION_BARRIER(pic->resolved_metadata,
                                      D3D12_RESOURCE_STATE_VIDEO_ENCODE_WRITE,
                                      D3D12_RESOURCE_STATE_COMMON);
 
-    ID3D12VideoEncodeCommandList2_ResourceBarrier(cmd_list, 5, barriers);
+    ID3D12VideoEncodeCommandList2_ResourceBarrier(cmd_list, 4, barriers);
 
     hr = ID3D12VideoEncodeCommandList2_Close(cmd_list);
     if (FAILED(hr)) {
@@ -489,6 +563,14 @@ static int d3d12va_encode_issue(AVCodecContext *avctx,
     if (d3d12_refs.ppTexture2Ds)
         av_freep(&d3d12_refs.ppTexture2Ds);
 
+    if (ctx->is_texture_array) {
+        if (d3d12_refs.pSubresources)
+            av_freep(&d3d12_refs.pSubresources);
+    }
+
+    if (barriers_ref)
+        av_freep(&barriers_ref);
+
     return 0;
 
 fail:
@@ -498,6 +580,14 @@ fail:
     if (d3d12_refs.ppTexture2Ds)
         av_freep(&d3d12_refs.ppTexture2Ds);
 
+    if (ctx->is_texture_array) {
+        if (d3d12_refs.pSubresources)
+            av_freep(&d3d12_refs.pSubresources);
+    }
+
+    if (barriers_ref)
+        av_freep(&barriers_ref);
+
     if (ctx->codec->free_picture_params)
         ctx->codec->free_picture_params(pic);
 
@@ -1088,13 +1178,15 @@ static int d3d12va_encode_init_gop_structure(AVCodecContext *avctx)
         switch (ctx->codec->d3d12_codec) {
             case D3D12_VIDEO_ENCODER_CODEC_H264:
                 ref_l0 = FFMIN(support.PictureSupport.pH264Support->MaxL0ReferencesForP,
-                               support.PictureSupport.pH264Support->MaxL1ReferencesForB);
+                               support.PictureSupport.pHEVCSupport->MaxL1ReferencesForB ?
+                               support.PictureSupport.pHEVCSupport->MaxL1ReferencesForB : UINT_MAX);
                 ref_l1 = support.PictureSupport.pH264Support->MaxL1ReferencesForB;
                 break;
 
             case D3D12_VIDEO_ENCODER_CODEC_HEVC:
                 ref_l0 = FFMIN(support.PictureSupport.pHEVCSupport->MaxL0ReferencesForP,
-                               support.PictureSupport.pHEVCSupport->MaxL1ReferencesForB);
+                               support.PictureSupport.pHEVCSupport->MaxL1ReferencesForB ?
+                               support.PictureSupport.pHEVCSupport->MaxL1ReferencesForB : UINT_MAX);
                 ref_l1 = support.PictureSupport.pHEVCSupport->MaxL1ReferencesForB;
                 break;
 
@@ -1336,6 +1428,47 @@ fail:
     return err;
 }
 
+static int d3d12va_create_texture_array(AVHWFramesContext *ctx, D3D12VAEncodeContext *encode_context)
+{
+    AVD3D12VAFramesContext *hwctx        = ctx->hwctx;
+    AVD3D12VADeviceContext *device_hwctx = ctx->device_ctx->hwctx;
+
+    AVD3D12VAFrame *frame;
+    D3D12_HEAP_PROPERTIES props = { .Type = D3D12_HEAP_TYPE_DEFAULT };
+
+    encode_context->max_subresource_array_size = MAX_DPB_SIZE + D3D12VA_VIDEO_ENC_ASYNC_DEPTH + 1;
+
+    D3D12_RESOURCE_DESC desc = {
+        .Dimension        = D3D12_RESOURCE_DIMENSION_TEXTURE2D,
+        .Alignment        = 0,
+        .Width            = ctx->width,
+        .Height           = ctx->height,
+        .DepthOrArraySize = encode_context->max_subresource_array_size,
+        .MipLevels        = 1,
+        .Format           = hwctx->format,
+        .SampleDesc       = {.Count = 1, .Quality = 0 },
+        .Layout           = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+        .Flags            = hwctx->flags,
+    };
+
+    frame = av_mallocz(sizeof(AVD3D12VAFrame));
+    if (!frame)
+        return AVERROR(ENOMEM);
+
+    if (FAILED(ID3D12Device_CreateCommittedResource(device_hwctx->device, &props, D3D12_HEAP_FLAG_NONE, &desc,
+        D3D12_RESOURCE_STATE_COMMON, NULL, &IID_ID3D12Resource, (void **)&frame->texture))) {
+        av_log(ctx, AV_LOG_ERROR, "Could not create the texture\n");
+        return AVERROR(EINVAL);
+    }
+
+    ID3D12Device_CreateFence(device_hwctx->device, 0, D3D12_FENCE_FLAG_NONE,
+                                      &IID_ID3D12Fence, (void **)&frame->sync_ctx.fence);
+
+    frame->sync_ctx.event = CreateEvent(NULL, FALSE, FALSE, NULL);
+    encode_context->texture_array_frame = frame;
+    return 0;
+}
+
 static int d3d12va_encode_create_recon_frames(AVCodecContext *avctx)
 {
     FFHWBaseEncodeContext *base_ctx = avctx->priv_data;
@@ -1394,6 +1527,7 @@ int ff_d3d12va_encode_init(AVCodecContext *avctx)
     FFHWBaseEncodeContext *base_ctx = avctx->priv_data;
     D3D12VAEncodeContext       *ctx = avctx->priv_data;
     D3D12_FEATURE_DATA_VIDEO_FEATURE_AREA_SUPPORT support = { 0 };
+    D3D12_FEATURE_DATA_FORMAT_INFO format_info = {0};
     int err;
     HRESULT hr;
 
@@ -1429,6 +1563,15 @@ int ff_d3d12va_encode_init(AVCodecContext *avctx)
         goto fail;
     }
 
+    format_info.Format = ((AVD3D12VAFramesContext*)base_ctx->input_frames->hwctx)->format;
+    if (FAILED(ID3D12VideoDevice_CheckFeatureSupport(ctx->hwctx->device, D3D12_FEATURE_FORMAT_INFO,
+        &format_info, sizeof(format_info)))) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query format plane count: 0x%x\n", hr);
+        err = AVERROR_EXTERNAL;
+        goto fail;
+    }
+    ctx->plane_count = format_info.PlaneCount;
+
     err = d3d12va_encode_set_profile(avctx);
     if (err < 0)
         goto fail;
@@ -1485,6 +1628,10 @@ int ff_d3d12va_encode_init(AVCodecContext *avctx)
             goto fail;
     }
 
+    if (ctx->is_texture_array) {
+        d3d12va_create_texture_array(base_ctx->recon_frames, avctx->priv_data);
+    }
+
     base_ctx->output_delay = base_ctx->b_per_p;
     base_ctx->decode_delay = base_ctx->max_b_depth;
 
@@ -1528,6 +1675,18 @@ int ff_d3d12va_encode_close(AVCodecContext *avctx)
 
     av_buffer_pool_uninit(&ctx->output_buffer_pool);
 
+    if (ctx->is_texture_array) {
+        ID3D12Resource *pResource = ctx->texture_array_frame->texture;
+        if (pResource) {
+            D3D12_OBJECT_RELEASE(pResource);
+            ctx->texture_array_frame->texture = NULL;
+        }
+        D3D12_OBJECT_RELEASE(ctx->texture_array_frame->sync_ctx.fence);
+        if (ctx->texture_array_frame->sync_ctx.event)
+            CloseHandle(ctx->texture_array_frame->sync_ctx.event);
+        av_free(ctx->texture_array_frame);
+    }
+
     D3D12_OBJECT_RELEASE(ctx->command_list);
     D3D12_OBJECT_RELEASE(ctx->command_queue);
 
diff --git a/libavcodec/d3d12va_encode.h b/libavcodec/d3d12va_encode.h
index 3b0b8153d5..fc31857f1a 100644
--- a/libavcodec/d3d12va_encode.h
+++ b/libavcodec/d3d12va_encode.h
@@ -52,6 +52,8 @@ typedef struct D3D12VAEncodePicture {
     ID3D12Resource *encoded_metadata;
     ID3D12Resource *resolved_metadata;
 
+    int            subresource_index;
+
     D3D12_VIDEO_ENCODER_PICTURE_CONTROL_CODEC_DATA pic_ctl;
 
     int             fence_value;
@@ -189,6 +191,33 @@ typedef struct D3D12VAEncodeContext {
      */
     AVBufferPool *output_buffer_pool;
 
+   /**
+    * Flag indicates if the HW is texture array mode.
+    */
+   int is_texture_array;
+
+   /**
+    * In texture array mode, the D3D12 uses the same texture array for all the input
+    * reference pics in ppTexture2Ds and also for the pReconstructedPicture output
+    * allocations, just different subresources.
+    */
+   AVD3D12VAFrame *texture_array_frame;
+
+   /**
+    * The max number of subresources in the texture array.
+    */
+   int max_subresource_array_size;
+
+   /**
+    * The used subresource index for pic in the texture array.
+    */
+   int subresource_used_index;
+
+   /**
+    * The number of planes in the input DXGI FORMAT .
+    */
+   int plane_count;
+
     /**
      * D3D12 video encoder.
      */
diff --git a/libavcodec/d3d12va_encode_hevc.c b/libavcodec/d3d12va_encode_hevc.c
index 938ba01f54..7e1d973f7e 100644
--- a/libavcodec/d3d12va_encode_hevc.c
+++ b/libavcodec/d3d12va_encode_hevc.c
@@ -280,9 +280,8 @@ static int d3d12va_encode_hevc_init_sequence_params(AVCodecContext *avctx)
     }
 
     if (support.SupportFlags & D3D12_VIDEO_ENCODER_SUPPORT_FLAG_RECONSTRUCTED_FRAMES_REQUIRE_TEXTURE_ARRAYS) {
-        av_log(avctx, AV_LOG_ERROR, "D3D12 video encode on this device requires texture array support, "
-               "but it's not implemented.\n");
-        return AVERROR_PATCHWELCOME;
+        ctx->is_texture_array = 1;
+        av_log(avctx, AV_LOG_DEBUG, "D3D12 video encode on this device uses texture array mode.\n");
     }
 
     desc = av_pix_fmt_desc_get(base_ctx->input_frames->sw_format);
-- 
2.45.2.windows.1