[FFmpeg-devel] [PATCH v2] avformat/mov: add option max_stts_delta

Tue Nov 23 15:11:06 EET 2021

Very high stts sample deltas may occasionally be intended but usually
they are written in error or used to store a negative value for dts correction
when treated as signed 32-bit integers.

This option lets the user set an upper limit, beyond which the delta
is clamped to 1. Negative values of under 1 second are used to adjust
dts.

Unit is the track time scale. Default is INT_MAX which maintains current handling.
---
 doc/demuxers.texi  |  6 ++++++
 libavformat/isom.h |  1 +
 libavformat/mov.c  | 14 +++++++++-----
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/doc/demuxers.texi b/doc/demuxers.texi
index cab8a7072c..15078b9b1b 100644
--- a/doc/demuxers.texi
+++ b/doc/demuxers.texi
@@ -713,6 +713,12 @@ specify.
 
 @item decryption_key
 16-byte key, in hex, to decrypt files encrypted using ISO Common Encryption (CENC/AES-128 CTR; ISO/IEC 23001-7).
+
+ at item max_stts_delta
+The sample offsets stored in a track's stts box are 32-bit unsigned integers. However, very large values usually indicate
+a value written by error or a storage of a small negative value as a way to correct accumulated DTS delay.
+Range is 0 to UINT_MAX. Default is INT_MAX.
+
 @end table
 
 @subsection Audible AAX
diff --git a/libavformat/isom.h b/libavformat/isom.h
index ef8f19b18c..625dea8421 100644
--- a/libavformat/isom.h
+++ b/libavformat/isom.h
@@ -305,6 +305,7 @@ typedef struct MOVContext {
     int32_t movie_display_matrix[3][3]; ///< display matrix from mvhd
     int have_read_mfra_size;
     uint32_t mfra_size;
+    uint32_t max_stts_delta;
 } MOVContext;
 
 int ff_mp4_read_descr_len(AVIOContext *pb);
diff --git a/libavformat/mov.c b/libavformat/mov.c
index 451cb78bbf..bbda07ac42 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -3965,14 +3965,17 @@ static void mov_build_index(MOVContext *mov, AVStream *st)
                 current_offset += sample_size;
                 stream_size += sample_size;
 
-                /* A negative sample duration is invalid based on the spec,
-                 * but some samples need it to correct the DTS. */
-                if (sc->stts_data[stts_index].duration < 0) {
+                /* STTS sample offsets are uint32 but some files store it as int32
+                 * with negative values used to correct DTS delays.
+                   There may be abnormally large values as well. */
+                if (sc->stts_data[stts_index].duration > mov->max_stts_delta) {
+                    // assume high delta is a negative correction if less than 1 second
+                    int32_t delta_magnitude = *((int32_t *)&sc->stts_data[stts_index].duration);
                     av_log(mov->fc, AV_LOG_WARNING,
-                           "Invalid SampleDelta %d in STTS, at %d st:%d\n",
+                           "Correcting too large SampleDelta %u in STTS, at %d st:%d.\n",
                            sc->stts_data[stts_index].duration, stts_index,
                            st->index);
-                    dts_correction += sc->stts_data[stts_index].duration - 1;
+                    dts_correction += (delta_magnitude < 0 && FFABS(delta_magnitude) < sc->time_scale ? delta_magnitude - 1 : 0);
                     sc->stts_data[stts_index].duration = 1;
                 }
                 current_dts += sc->stts_data[stts_index].duration;
@@ -8566,6 +8569,7 @@ static const AVOption mov_options[] = {
     { "decryption_key", "The media decryption key (hex)", OFFSET(decryption_key), AV_OPT_TYPE_BINARY, .flags = AV_OPT_FLAG_DECODING_PARAM },
     { "enable_drefs", "Enable external track support.", OFFSET(enable_drefs), AV_OPT_TYPE_BOOL,
         {.i64 = 0}, 0, 1, FLAGS },
+    { "max_stts_delta", "treat offsets above this value as invalid", OFFSET(max_stts_delta), AV_OPT_TYPE_INT, {.i64 = INT_MAX}, 0, UINT_MAX, .flags = AV_OPT_FLAG_DECODING_PARAM },
 
     { NULL },
 };
-- 
2.33.0