[FFmpeg-devel] [PATCH] avcodec/cuvid: Add support for P010 as an output surface format

Mon Nov 21 13:55:36 EET 2016

On Sat, 19 Nov 2016 17:18:08 -0800
Philip Langdale <philipl at overt.org> wrote:

> The nvidia 375.xx driver introduces support for P016 output surfaces,
> for 10bit and 12bit HEVC content (it's also the first driver to support
> hardware decoding of 12bit content).
> 
> Technically, we don't support P016, but in practice I don't think we
> zero-out the extra bits in P010 so it can be used to carry the data.
> 
> This change adds cuvid decoder support for P010 output to both
> hardware and system memory surfaces. For simplicity, it does not
> maintain the previous ability to output NV12 for > 8 bit input
> video - the user will need to update their driver to decode such
> videos.
> 
> After this change, both cuvid and nvenc support P010, but the
> ffmpeg_cuvid transcoding logic will need more work to connect the
> two together. Similarly, the scale_npp filter still only works with
> 8bit surfaces.
> 
> Signed-off-by: Philip Langdale <philipl at overt.org>
> ---
>  compat/cuda/dynlink_cuviddec.h |  3 ++-
>  libavcodec/cuvid.c             | 58 +++++++++++++++++++++++++++++++-----------
>  libavutil/hwcontext_cuda.c     | 11 +++++++-
>  3 files changed, 55 insertions(+), 17 deletions(-)
> 
> diff --git a/compat/cuda/dynlink_cuviddec.h b/compat/cuda/dynlink_cuviddec.h
> index 17207bc..9ff2741 100644
> --- a/compat/cuda/dynlink_cuviddec.h
> +++ b/compat/cuda/dynlink_cuviddec.h
> @@ -83,7 +83,8 @@ typedef enum cudaVideoCodec_enum {
>   * Video Surface Formats Enums
>   */
>  typedef enum cudaVideoSurfaceFormat_enum {
> -    cudaVideoSurfaceFormat_NV12=0       /**< NV12 (currently the only supported output format)  */
> +    cudaVideoSurfaceFormat_NV12=0,      /**< NV12  */
> +    cudaVideoSurfaceFormat_P016=1       /**< P016  */
>  } cudaVideoSurfaceFormat;
>  
>  /*!
> diff --git a/libavcodec/cuvid.c b/libavcodec/cuvid.c
> index c3e831a..34b0734 100644
> --- a/libavcodec/cuvid.c
> +++ b/libavcodec/cuvid.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/fifo.h"
>  #include "libavutil/log.h"
>  #include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
>  
>  #include "avcodec.h"
>  #include "internal.h"
> @@ -103,11 +104,35 @@ static int CUDAAPI cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* form
>      CuvidContext *ctx = avctx->priv_data;
>      AVHWFramesContext *hwframe_ctx = (AVHWFramesContext*)ctx->hwframe->data;
>      CUVIDDECODECREATEINFO cuinfo;
> +    int surface_fmt;
> +
> +    enum AVPixelFormat pix_fmts_nv12[3] = { AV_PIX_FMT_CUDA,
> +                                            AV_PIX_FMT_NV12,
> +                                            AV_PIX_FMT_NONE };
> +
> +    enum AVPixelFormat pix_fmts_p010[3] = { AV_PIX_FMT_CUDA,
> +                                            AV_PIX_FMT_P010,
> +                                            AV_PIX_FMT_NONE };
>  
>      av_log(avctx, AV_LOG_TRACE, "pfnSequenceCallback, progressive_sequence=%d\n", format->progressive_sequence);
>  
>      ctx->internal_error = 0;
>  
> +    surface_fmt = ff_get_format(avctx, format->bit_depth_luma_minus8 > 0 ?
> +                                pix_fmts_p010 : pix_fmts_nv12);
> +    if (surface_fmt < 0) {
> +        av_log(avctx, AV_LOG_ERROR, "ff_get_format failed: %d\n", surface_fmt);
> +        ctx->internal_error = AVERROR(EINVAL);
> +        return 0;
> +    }
> +
> +    av_log(avctx, AV_LOG_VERBOSE, "Formats: Original: %s | HW: %s | SW: %s\n",
> +           av_get_pix_fmt_name(avctx->pix_fmt),
> +           av_get_pix_fmt_name(surface_fmt),
> +           av_get_pix_fmt_name(avctx->sw_pix_fmt));
> +
> +    avctx->pix_fmt = surface_fmt;
> +
>      avctx->width = format->display_area.right;
>      avctx->height = format->display_area.bottom;
>  
> @@ -156,7 +181,7 @@ static int CUDAAPI cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* form
>              hwframe_ctx->width < avctx->width ||
>              hwframe_ctx->height < avctx->height ||
>              hwframe_ctx->format != AV_PIX_FMT_CUDA ||
> -            hwframe_ctx->sw_format != AV_PIX_FMT_NV12)) {
> +            hwframe_ctx->sw_format != avctx->sw_pix_fmt)) {
>          av_log(avctx, AV_LOG_ERROR, "AVHWFramesContext is already initialized with incompatible parameters\n");
>          ctx->internal_error = AVERROR(EINVAL);
>          return 0;
> @@ -177,7 +202,19 @@ static int CUDAAPI cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* form
>  
>      cuinfo.CodecType = ctx->codec_type = format->codec;
>      cuinfo.ChromaFormat = format->chroma_format;
> -    cuinfo.OutputFormat = cudaVideoSurfaceFormat_NV12;
> +
> +    switch (avctx->sw_pix_fmt) {
> +    case AV_PIX_FMT_NV12:
> +        cuinfo.OutputFormat = cudaVideoSurfaceFormat_NV12;
> +        break;
> +    case AV_PIX_FMT_P010:
> +        cuinfo.OutputFormat = cudaVideoSurfaceFormat_P016;
> +        break;
> +    default:
> +        av_log(avctx, AV_LOG_ERROR, "Output formats other than NV12 or P010 are not supported\n");
> +        ctx->internal_error = AVERROR(EINVAL);
> +        return 0;
> +    }
>  
>      cuinfo.ulWidth = avctx->coded_width;
>      cuinfo.ulHeight = avctx->coded_height;
> @@ -209,7 +246,7 @@ static int CUDAAPI cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* form
>  
>      if (!hwframe_ctx->pool) {
>          hwframe_ctx->format = AV_PIX_FMT_CUDA;
> -        hwframe_ctx->sw_format = AV_PIX_FMT_NV12;
> +        hwframe_ctx->sw_format = avctx->sw_pix_fmt;
>          hwframe_ctx->width = avctx->width;
>          hwframe_ctx->height = avctx->height;
>  
> @@ -417,7 +454,8 @@ static int cuvid_output_frame(AVCodecContext *avctx, AVFrame *frame)
>  
>                  offset += avctx->coded_height;
>              }
> -        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
> +        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12 ||
> +                   avctx->pix_fmt == AV_PIX_FMT_P010) {
>              AVFrame *tmp_frame = av_frame_alloc();
>              if (!tmp_frame) {
>                  av_log(avctx, AV_LOG_ERROR, "av_frame_alloc failed\n");
> @@ -615,17 +653,6 @@ static av_cold int cuvid_decode_init(AVCodecContext *avctx)
>      const AVBitStreamFilter *bsf;
>      int ret = 0;
>  
> -    enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_CUDA,
> -                                       AV_PIX_FMT_NV12,
> -                                       AV_PIX_FMT_NONE };
> -
> -    ret = ff_get_format(avctx, pix_fmts);
> -    if (ret < 0) {
> -        av_log(avctx, AV_LOG_ERROR, "ff_get_format failed: %d\n", ret);
> -        return ret;
> -    }
> -    avctx->pix_fmt = ret;
> -
>      ret = cuvid_load_functions(&ctx->cvdl);
>      if (ret < 0) {
>          av_log(avctx, AV_LOG_ERROR, "Failed loading nvcuvid.\n");
> @@ -899,6 +926,7 @@ static const AVOption options[] = {
>          .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
>          .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, \
>                                                          AV_PIX_FMT_NV12, \
> +                                                        AV_PIX_FMT_P010, \
>                                                          AV_PIX_FMT_NONE }, \
>      };
>  
> diff --git a/libavutil/hwcontext_cuda.c b/libavutil/hwcontext_cuda.c
> index 30de299..e413aa8 100644
> --- a/libavutil/hwcontext_cuda.c
> +++ b/libavutil/hwcontext_cuda.c
> @@ -35,6 +35,7 @@ static const enum AVPixelFormat supported_formats[] = {
>      AV_PIX_FMT_NV12,
>      AV_PIX_FMT_YUV420P,
>      AV_PIX_FMT_YUV444P,
> +    AV_PIX_FMT_P010,
>  };
>  
>  static void cuda_buffer_free(void *opaque, uint8_t *data)
> @@ -111,6 +112,7 @@ static int cuda_frames_init(AVHWFramesContext *ctx)
>              size = aligned_width * ctx->height * 3 / 2;
>              break;
>          case AV_PIX_FMT_YUV444P:
> +        case AV_PIX_FMT_P010:
>              size = aligned_width * ctx->height * 3;
>              break;
>          }
> @@ -125,7 +127,13 @@ static int cuda_frames_init(AVHWFramesContext *ctx)
>  
>  static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
>  {
> -    int aligned_width = FFALIGN(ctx->width, CUDA_FRAME_ALIGNMENT);
> +    int aligned_width;
> +    int width_in_bytes = ctx->width;
> +
> +    if (ctx->sw_format == AV_PIX_FMT_P010) {
> +       width_in_bytes *= 2;
> +    }
> +    aligned_width = FFALIGN(width_in_bytes, CUDA_FRAME_ALIGNMENT);
>  
>      frame->buf[0] = av_buffer_pool_get(ctx->pool);
>      if (!frame->buf[0])
> @@ -133,6 +141,7 @@ static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
>  
>      switch (ctx->sw_format) {
>      case AV_PIX_FMT_NV12:
> +    case AV_PIX_FMT_P010:
>          frame->data[0]     = frame->buf[0]->data;
>          frame->data[1]     = frame->data[0] + aligned_width * ctx->height;
>          frame->linesize[0] = aligned_width;

I think it would be better to add a P016 pixfmt if the decoder can
output data in which the LSBs that are normally 0 in P010 are set.