[FFmpeg-devel] [PATCH 13/13] avformat/mov: add support for Immersive Audio Model and Formats in ISOBMFF

Mon Nov 27 20:43:57 EET 2023

Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavformat/Makefile |   3 +-
 libavformat/isom.h   |   6 +
 libavformat/mov.c    | 290 +++++++++++++++++++++++++++++++++++++++----
 3 files changed, 272 insertions(+), 27 deletions(-)

diff --git a/libavformat/Makefile b/libavformat/Makefile
index 521bf5fef6..0272311828 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -364,7 +364,8 @@ OBJS-$(CONFIG_MMF_MUXER)                 += mmf.o rawenc.o
 OBJS-$(CONFIG_MODS_DEMUXER)              += mods.o
 OBJS-$(CONFIG_MOFLEX_DEMUXER)            += moflex.o
 OBJS-$(CONFIG_MOV_DEMUXER)               += mov.o mov_chan.o mov_esds.o \
-                                            qtpalette.o replaygain.o dovi_isom.o
+                                            qtpalette.o replaygain.o dovi_isom.o \
+                                            iamf.o
 OBJS-$(CONFIG_MOV_MUXER)                 += movenc.o av1.o avc.o hevc.o vpcc.o \
                                             movenchint.o mov_chan.o rtp.o \
                                             movenccenc.o movenc_ttml.o rawutils.o \
diff --git a/libavformat/isom.h b/libavformat/isom.h
index 3d375d7a46..32d42490b5 100644
--- a/libavformat/isom.h
+++ b/libavformat/isom.h
@@ -33,6 +33,7 @@
 #include "libavutil/stereo3d.h"
 
 #include "avio.h"
+#include "iamf.h"
 #include "internal.h"
 #include "dv.h"
 
@@ -166,6 +167,7 @@ typedef struct MOVIndexRange {
 typedef struct MOVStreamContext {
     AVIOContext *pb;
     int pb_is_copied;
+    int id;               ///< AVStream id
     int ffindex;          ///< AVStream index
     int next_chunk;
     unsigned int chunk_count;
@@ -260,6 +262,10 @@ typedef struct MOVStreamContext {
         AVEncryptionInfo *default_encrypted_sample;
         MOVEncryptionIndex *encryption_index;
     } cenc;
+
+    IAMFContext *iamf;
+    uint8_t *iamf_descriptors;
+    int iamf_descriptors_size;
 } MOVStreamContext;
 
 typedef struct MOVContext {
diff --git a/libavformat/mov.c b/libavformat/mov.c
index d1f214a441..11c68a2f6e 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -59,6 +59,7 @@
 #include "internal.h"
 #include "avio_internal.h"
 #include "demux.h"
+#include "iamf.h"
 #include "dovi_isom.h"
 #include "riff.h"
 #include "isom.h"
@@ -851,6 +852,163 @@ static int mov_read_dac3(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     return 0;
 }
 
+static int mov_read_iamf(MOVContext *c, AVIOContext *pb, int64_t size)
+{
+    AVStream *st;
+    MOVStreamContext *sc;
+    FFIOContext b;
+    AVIOContext *descriptor_pb;
+    AVDictionary *metadata;
+    IAMFContext *iamf;
+    char args[32];
+    int64_t start_time, duration;
+    int nb_frames, disposition;
+    int ret;
+
+    if ((int)size != size)
+        return AVERROR(ENOMEM);
+
+    if (c->fc->nb_streams < 1)
+        return 0;
+
+    st = c->fc->streams[c->fc->nb_streams - 1];
+    sc = mov_get_stream_context(st);
+
+    metadata = st->metadata;
+    st->metadata = NULL;
+    start_time = st->start_time;
+    nb_frames = st->nb_frames;
+    duration = st->duration;
+    disposition = st->disposition;
+
+    iamf = sc->iamf = av_mallocz(sizeof(*iamf));
+    if (!iamf) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    sc->iamf_descriptors = av_malloc(size);
+    if (!sc->iamf_descriptors) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    sc->iamf_descriptors_size = size;
+    ret = avio_read(pb, sc->iamf_descriptors, size);
+    if (ret != size) {
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    ffio_init_context(&b, sc->iamf_descriptors, size, 0, NULL, NULL, NULL, NULL);
+    descriptor_pb = &b.pub;
+
+    ret = ff_iamfdec_read_descriptors(iamf, descriptor_pb, size, c->fc);
+    if (ret < 0)
+        goto fail;
+
+    for (int i = 0; i < iamf->nb_audio_elements; i++) {
+        IAMFAudioElement *audio_element = &iamf->audio_elements[i];
+        AVStreamGroup *stg = avformat_stream_group_create(c->fc, AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT, NULL);
+
+        if (!stg) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        stg->id = audio_element->audio_element_id;
+        stg->params.iamf_audio_element = audio_element->element;
+        audio_element->element = NULL;
+
+        for (int j = 0; j < audio_element->nb_substreams; j++) {
+            IAMFSubStream *substream = &audio_element->substreams[j];
+            AVStream *stream;
+
+            if (!i && !j)
+                stream = st;
+            else
+                stream = avformat_new_stream(c->fc, NULL);
+            if (!stream) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            stream->start_time = start_time;
+            stream->nb_frames = nb_frames;
+            stream->duration = duration;
+            stream->disposition = disposition;
+            if (stream != st && !(stream->priv_data = av_buffer_ref(st->priv_data))) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            ret = avcodec_parameters_copy(stream->codecpar, substream->codecpar);
+            if (ret < 0)
+                goto fail;
+
+            stream->id = substream->audio_substream_id;
+
+            avpriv_set_pts_info(st, 64, 1, sc->time_scale);
+
+            ret = avformat_stream_group_add_stream(stg, stream);
+            if (ret < 0)
+                goto fail;
+        }
+
+        ret = av_dict_copy(&stg->metadata, metadata, 0);
+        if (ret < 0)
+            goto fail;
+    }
+
+    for (int i = 0; i < iamf->nb_mix_presentations; i++) {
+        IAMFMixPresentation *mix_presentation = &iamf->mix_presentations[i];
+        const AVIAMFMixPresentation *mix = mix_presentation->mix;
+        AVStreamGroup *stg = avformat_stream_group_create(c->fc, AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION, NULL);
+
+        if (!stg)
+            goto fail;
+
+        stg->id = mix_presentation->mix_presentation_id;
+        stg->params.iamf_mix_presentation = mix_presentation->mix;
+        mix_presentation->mix = NULL;
+
+        for (int j = 0; j < mix->num_submixes; j++) {
+            const AVIAMFSubmix *submix = mix->submixes[j];
+
+            for (int k = 0; k < submix->num_elements; k++) {
+                const AVIAMFSubmixElement *submix_element = submix->elements[k];
+                const AVStreamGroup *audio_element = NULL;
+
+                for (int l = 0; l < c->fc->nb_stream_groups; l++)
+                    if (c->fc->stream_groups[l]->type == AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT &&
+                        c->fc->stream_groups[l]->id   == submix_element->audio_element_id) {
+                        audio_element = c->fc->stream_groups[l];
+                        break;
+                    }
+                av_assert0(audio_element);
+
+                for (int l = 0; l < audio_element->nb_streams; l++) {
+                    ret = avformat_stream_group_add_stream(stg, audio_element->streams[l]);
+                    if (ret < 0 && ret != AVERROR(EEXIST))
+                        goto fail;
+                }
+            }
+        }
+
+        ret = av_dict_copy(&stg->metadata, metadata, 0);
+        if (ret < 0)
+            goto fail;
+    }
+
+    snprintf(args, sizeof(args), "first_index=%d", st->index);
+
+    ret = ff_stream_add_bitstream_filter(st, "iamf_stream_split", args);
+fail:
+    av_dict_free(&metadata);
+
+    return ret;
+}
+
 static int mov_read_dec3(MOVContext *c, AVIOContext *pb, MOVAtom atom)
 {
     AVStream *st;
@@ -1393,7 +1551,7 @@ static int64_t get_frag_time(AVFormatContext *s, AVStream *dst_st,
     // If the stream is referenced by any sidx, limit the search
     // to fragments that referenced this stream in the sidx
     if (sc->has_sidx) {
-        frag_stream_info = get_frag_stream_info(frag_index, index, dst_st->id);
+        frag_stream_info = get_frag_stream_info(frag_index, index, sc->id);
         if (frag_stream_info->sidx_pts != AV_NOPTS_VALUE)
             return frag_stream_info->sidx_pts;
         if (frag_stream_info->first_tfra_pts != AV_NOPTS_VALUE)
@@ -1404,9 +1562,11 @@ static int64_t get_frag_time(AVFormatContext *s, AVStream *dst_st,
     for (i = 0; i < frag_index->item[index].nb_stream_info; i++) {
         AVStream *frag_stream = NULL;
         frag_stream_info = &frag_index->item[index].stream_info[i];
-        for (j = 0; j < s->nb_streams; j++)
-            if (s->streams[j]->id == frag_stream_info->id)
+        for (j = 0; j < s->nb_streams; j++) {
+            MOVStreamContext *sc2 = mov_get_stream_context(s->streams[j]);
+            if (sc2->id == frag_stream_info->id)
                 frag_stream = s->streams[j];
+        }
         if (!frag_stream) {
             av_log(s, AV_LOG_WARNING, "No stream matching sidx ID found.\n");
             continue;
@@ -1472,12 +1632,13 @@ static int update_frag_index(MOVContext *c, int64_t offset)
 
     for (i = 0; i < c->fc->nb_streams; i++) {
         // Avoid building frag index if streams lack track id.
-        if (c->fc->streams[i]->id < 0) {
+        MOVStreamContext *sc = mov_get_stream_context(c->fc->streams[i]);
+        if (sc->id < 0) {
             av_free(frag_stream_info);
             return AVERROR_INVALIDDATA;
         }
 
-        frag_stream_info[i].id = c->fc->streams[i]->id;
+        frag_stream_info[i].id = sc->id;
         frag_stream_info[i].sidx_pts = AV_NOPTS_VALUE;
         frag_stream_info[i].tfdt_dts = AV_NOPTS_VALUE;
         frag_stream_info[i].next_trun_dts = AV_NOPTS_VALUE;
@@ -2368,14 +2529,17 @@ static void mov_parse_stsd_video(MOVContext *c, AVIOContext *pb,
     }
 }
 
-static void mov_parse_stsd_audio(MOVContext *c, AVIOContext *pb,
-                                 AVStream *st, MOVStreamContext *sc)
+static int mov_parse_stsd_audio(MOVContext *c, AVIOContext *pb,
+                                 AVStream *st, MOVStreamContext *sc,
+                                 int64_t size)
 {
     int bits_per_sample, flags;
+    int64_t start_pos = avio_tell(pb);
     uint16_t version = avio_rb16(pb);
     uint32_t id = 0;
     AVDictionaryEntry *compatible_brands = av_dict_get(c->fc->metadata, "compatible_brands", NULL, AV_DICT_MATCH_CASE);
     int channel_count;
+    int ret;
 
     avio_rb16(pb); /* revision level */
     id = avio_rl32(pb); /* vendor */
@@ -2436,7 +2600,9 @@ static void mov_parse_stsd_audio(MOVContext *c, AVIOContext *pb,
             st->codecpar->codec_id = mov_codec_id(st, MKTAG('r','a','w',' '));
         else if (st->codecpar->bits_per_coded_sample == 16)
             st->codecpar->codec_id = mov_codec_id(st, MKTAG('t','w','o','s'));
-    }
+    } else if (sc->format == MKTAG('i','a','m','f'))
+        if ((ret = mov_read_iamf(c, pb, size - (avio_tell(pb) - start_pos))) < 0)
+            return ret;
 
     switch (st->codecpar->codec_id) {
     case AV_CODEC_ID_PCM_S8:
@@ -2483,6 +2649,8 @@ static void mov_parse_stsd_audio(MOVContext *c, AVIOContext *pb,
         st->codecpar->bits_per_coded_sample = bits_per_sample;
         sc->sample_size = (bits_per_sample >> 3) * st->codecpar->ch_layout.nb_channels;
     }
+
+    return 0;
 }
 
 static void mov_parse_stsd_subtitle(MOVContext *c, AVIOContext *pb,
@@ -2772,7 +2940,10 @@ int ff_mov_read_stsd_entries(MOVContext *c, AVIOContext *pb, int entries)
         if (st->codecpar->codec_type==AVMEDIA_TYPE_VIDEO) {
             mov_parse_stsd_video(c, pb, st, sc);
         } else if (st->codecpar->codec_type==AVMEDIA_TYPE_AUDIO) {
-            mov_parse_stsd_audio(c, pb, st, sc);
+            int ret = mov_parse_stsd_audio(c, pb, st, sc,
+                                           size - (avio_tell(pb) - start_pos));
+            if (ret < 0)
+                return ret;
             if (st->codecpar->sample_rate < 0) {
                 av_log(c->fc, AV_LOG_ERROR, "Invalid sample rate %d\n", st->codecpar->sample_rate);
                 return AVERROR_INVALIDDATA;
@@ -3261,7 +3432,7 @@ static int mov_read_stts(MOVContext *c, AVIOContext *pb, MOVAtom atom)
                "All samples in data stream index:id [%d:%d] have zero "
                "duration, stream set to be discarded by default. Override "
                "using AVStream->discard or -discard for ffmpeg command.\n",
-               st->index, st->id);
+               st->index, sc->id);
         st->discard = AVDISCARD_ALL;
     }
     sc->track_end = duration;
@@ -4641,6 +4812,50 @@ static void fix_timescale(MOVContext *c, MOVStreamContext *sc)
     }
 }
 
+static int mov_update_iamf_streams(MOVContext *c, const AVStream *st)
+{
+    const MOVStreamContext *sc = mov_get_stream_context(st);
+
+    for (int i = 0; i < sc->iamf->nb_audio_elements; i++) {
+        const AVStreamGroup *stg = NULL;
+
+        for (int j = 0; j < c->fc->nb_stream_groups; j++)
+            if (c->fc->stream_groups[j]->id == sc->iamf->audio_elements[i].audio_element_id)
+                stg = c->fc->stream_groups[j];
+        av_assert0(stg);
+
+        for (int j = 0; j < stg->nb_streams; j++) {
+            const FFStream *sti = cffstream(st);
+            AVStream *out = stg->streams[j];
+            FFStream *out_sti = ffstream(stg->streams[j]);
+
+            out->codecpar->bit_rate = 0;
+
+            if (out == st)
+                continue;
+
+            out->time_base           = st->time_base;
+            out->start_time          = st->start_time;
+            out->duration            = st->duration;
+            out->nb_frames           = st->nb_frames;
+            out->disposition         = st->disposition;
+            out->discard             = st->discard;
+
+            av_assert0(!out_sti->index_entries);
+            out_sti->index_entries = av_malloc(sti->index_entries_allocated_size);
+            if (!out_sti->index_entries)
+                return AVERROR(ENOMEM);
+
+            out_sti->index_entries_allocated_size = sti->index_entries_allocated_size;
+            out_sti->nb_index_entries = sti->nb_index_entries;
+            out_sti->skip_samples = sti->skip_samples;
+            memcpy(out_sti->index_entries, sti->index_entries, sti->index_entries_allocated_size);
+        }
+    }
+
+    return 0;
+}
+
 static int mov_read_trak(MOVContext *c, AVIOContext *pb, MOVAtom atom)
 {
     AVStream *st;
@@ -4715,6 +4930,12 @@ static int mov_read_trak(MOVContext *c, AVIOContext *pb, MOVAtom atom)
 
     mov_build_index(c, st);
 
+    if (sc->iamf) {
+        ret = mov_update_iamf_streams(c, st);
+        if (ret < 0)
+            return ret;
+    }
+
     if (sc->dref_id-1 < sc->drefs_count && sc->drefs[sc->dref_id-1].path) {
         MOVDref *dref = &sc->drefs[sc->dref_id - 1];
         if (c->enable_drefs) {
@@ -4955,6 +5176,7 @@ static int avif_add_stream(MOVContext *c, int item_id)
     st->priv_data = sc;
     st->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
     st->codecpar->codec_id = AV_CODEC_ID_AV1;
+    sc->id = st->id;
     sc->ffindex = st->index;
     c->trak_index = st->index;
     st->avg_frame_rate.num = st->avg_frame_rate.den = 1;
@@ -5069,6 +5291,7 @@ static int mov_read_tkhd(MOVContext *c, AVIOContext *pb, MOVAtom atom)
         avio_rb32(pb); /* modification time */
     }
     st->id = (int)avio_rb32(pb); /* track id (NOT 0 !)*/
+    sc->id = st->id;
     avio_rb32(pb); /* reserved */
 
     /* highlevel (considering edits) duration in movie timebase */
@@ -5243,7 +5466,8 @@ static int mov_read_tfdt(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     int64_t base_media_decode_time;
 
     for (i = 0; i < c->fc->nb_streams; i++) {
-        if (c->fc->streams[i]->id == frag->track_id) {
+        sc = mov_get_stream_context(c->fc->streams[i]);
+        if (sc->id == frag->track_id) {
             st = c->fc->streams[i];
             break;
         }
@@ -5252,7 +5476,6 @@ static int mov_read_tfdt(MOVContext *c, AVIOContext *pb, MOVAtom atom)
         av_log(c->fc, AV_LOG_WARNING, "could not find corresponding track id %u\n", frag->track_id);
         return 0;
     }
-    sc = mov_get_stream_context(st);
     if (sc->pseudo_stream_id + 1 != frag->stsd_id && sc->pseudo_stream_id != -1)
         return 0;
     version = avio_r8(pb);
@@ -5296,7 +5519,8 @@ static int mov_read_trun(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     }
 
     for (i = 0; i < c->fc->nb_streams; i++) {
-        if (c->fc->streams[i]->id == frag->track_id) {
+        sc = mov_get_stream_context(c->fc->streams[i]);
+        if (sc->id == frag->track_id) {
             st = c->fc->streams[i];
             sti = ffstream(st);
             break;
@@ -5306,7 +5530,6 @@ static int mov_read_trun(MOVContext *c, AVIOContext *pb, MOVAtom atom)
         av_log(c->fc, AV_LOG_WARNING, "could not find corresponding track id %u\n", frag->track_id);
         return 0;
     }
-    sc = mov_get_stream_context(st);
     if (sc->pseudo_stream_id+1 != frag->stsd_id && sc->pseudo_stream_id != -1)
         return 0;
 
@@ -5599,7 +5822,8 @@ static int mov_read_sidx(MOVContext *c, AVIOContext *pb, MOVAtom atom)
 
     track_id = avio_rb32(pb); // Reference ID
     for (i = 0; i < c->fc->nb_streams; i++) {
-        if (c->fc->streams[i]->id == track_id) {
+        sc = mov_get_stream_context(c->fc->streams[i]);
+        if (sc->id == track_id) {
             st = c->fc->streams[i];
             break;
         }
@@ -5609,8 +5833,6 @@ static int mov_read_sidx(MOVContext *c, AVIOContext *pb, MOVAtom atom)
         return 0;
     }
 
-    sc = mov_get_stream_context(st);
-
     timescale = av_make_q(1, avio_rb32(pb));
 
     if (timescale.den <= 0) {
@@ -6491,14 +6713,14 @@ static int get_current_encryption_info(MOVContext *c, MOVEncryptionIndex **encry
     frag_stream_info = get_current_frag_stream_info(&c->frag_index);
     if (frag_stream_info) {
         for (i = 0; i < c->fc->nb_streams; i++) {
-            if (c->fc->streams[i]->id == frag_stream_info->id) {
+            *sc = mov_get_stream_context(c->fc->streams[i]);
+            if ((*sc)->id == frag_stream_info->id) {
               st = c->fc->streams[i];
               break;
             }
         }
         if (i == c->fc->nb_streams)
             return 0;
-        *sc = mov_get_stream_context(st);
 
         if (!frag_stream_info->encryption_index) {
             // If this stream isn't encrypted, don't create the index.
@@ -7435,7 +7657,7 @@ static int cenc_filter(MOVContext *mov, AVStream* st, MOVStreamContext *sc, AVPa
     AVEncryptionInfo *encrypted_sample;
     int encrypted_index, ret;
 
-    frag_stream_info = get_frag_stream_info_from_pkt(&mov->frag_index, pkt, st->id);
+    frag_stream_info = get_frag_stream_info_from_pkt(&mov->frag_index, pkt, sc->id);
     encrypted_index = current_index;
     encryption_index = NULL;
     if (frag_stream_info) {
@@ -8212,18 +8434,19 @@ static void mov_read_chapters(AVFormatContext *s)
         AVStream *st = NULL;
         FFStream *sti = NULL;
         chapter_track = mov->chapter_tracks[j];
-        for (i = 0; i < s->nb_streams; i++)
-            if (s->streams[i]->id == chapter_track) {
+        for (i = 0; i < s->nb_streams; i++) {
+            sc = mov_get_stream_context(s->streams[i]);
+            if (sc->id == chapter_track) {
                 st = s->streams[i];
                 break;
             }
+        }
         if (!st) {
             av_log(s, AV_LOG_ERROR, "Referenced QT chapter track not found\n");
             continue;
         }
         sti = ffstream(st);
 
-        sc = mov_get_stream_context(st);
         cur_pos = avio_tell(sc->pb);
 
         if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
@@ -8444,6 +8667,11 @@ static void mov_free_stream_context(void *opaque, uint8_t *data)
     av_freep(&sc->spherical);
     av_freep(&sc->mastering);
     av_freep(&sc->coll);
+
+    ff_iamf_uninit_context(sc->iamf);
+    av_freep(&sc->iamf);
+    av_freep(&sc->iamf_descriptors);
+    sc->iamf_descriptors_size = 0;
 }
 
 static int mov_read_close(AVFormatContext *s)
@@ -8682,9 +8910,11 @@ static int mov_read_header(AVFormatContext *s)
             AVDictionaryEntry *tcr;
             int tmcd_st_id = -1;
 
-            for (j = 0; j < s->nb_streams; j++)
-                if (s->streams[j]->id == sc->timecode_track)
+            for (j = 0; j < s->nb_streams; j++) {
+                MOVStreamContext *sc2 = mov_get_stream_context(s->streams[j]);
+                if (sc2->id == sc->timecode_track)
                     tmcd_st_id = j;
+            }
 
             if (tmcd_st_id < 0 || tmcd_st_id == i)
                 continue;
@@ -8997,7 +9227,15 @@ static int mov_read_packet(AVFormatContext *s, AVPacket *pkt)
 
         if (st->codecpar->codec_id == AV_CODEC_ID_EIA_608 && sample->size > 8)
             ret = get_eia608_packet(sc->pb, pkt, sample->size);
-        else
+        else if (sc->iamf_descriptors_size) {
+            ret = av_new_packet(pkt, sc->iamf_descriptors_size);
+            if (ret < 0)
+                return ret;
+            pkt->pos  = avio_tell(sc->pb);
+            memcpy(pkt->data, sc->iamf_descriptors, sc->iamf_descriptors_size);
+            sc->iamf_descriptors_size = 0;
+            ret = av_append_packet(sc->pb, pkt, sample->size);
+        } else
             ret = av_get_packet(sc->pb, pkt, sample->size);
         if (ret < 0) {
             if (should_retry(sc->pb, ret)) {
-- 
2.42.1