[FFmpeg-devel] [PATCH v2] Add SRV3 decoder/demuxer

Thu Dec 26 00:29:24 EET 2024

This commit adds preliminary support for decoding the SRV3 subtitle format.
SRV3 is the internal format YouTube uses for their captions. Supporting it
in ffmpeg allows video players to play a significant subset of SRV3
mostly correctly by converting it to ASS.
Currently the following features are unsupported:
- Vertical text
- Scrolling text
- Ruby text
- Background box support is janky
These issues are mostly due to limitations of the ASS format.

Signed-off-by: Hubert Głuchowski <fishhh at fishhh.dev>
---
Hi guys, the ffmpeg-devel mailing list seems very preoccupied
arguing about authoritarianism, and as entertaining as that is to watch
externally, it may have contributed to no one looking at my previous
patch.

In v2 I changed srv3_clean_segment_text to make it less brain-dead
and did stuff I initially missed from the "submitting a patch"
checklist (changelog, supported formats table etc.).

 Changelog                 |   1 +
 MAINTAINERS               |   2 +
 configure                 |   2 +
 doc/general_contents.texi |   1 +
 libavcodec/Makefile       |   1 +
 libavcodec/allcodecs.c    |   1 +
 libavcodec/codec_desc.c   |   7 +
 libavcodec/codec_id.h     |   1 +
 libavcodec/packet.c       |   2 +
 libavcodec/packet.h       |  12 +
 libavcodec/srv3dec.c      | 260 +++++++++++++++++++
 libavformat/Makefile      |   1 +
 libavformat/allformats.c  |   1 +
 libavformat/srv3.h        |  95 +++++++
 libavformat/srv3dec.c     | 520 ++++++++++++++++++++++++++++++++++++++
 libavformat/version.h     |   4 +-
 16 files changed, 909 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/srv3dec.c
 create mode 100644 libavformat/srv3.h
 create mode 100644 libavformat/srv3dec.c

diff --git a/Changelog b/Changelog
index 779bb2c8bf..0cd0ad8a2c 100644
--- a/Changelog
+++ b/Changelog
@@ -8,6 +8,7 @@ version <next>:
 - OpenMAX encoders deprecated
 - libx265 alpha layer encoding
 - ADPCM IMA Xbox decoder
+- SRV3 subtitle decoder
 
 version 7.1:
 - Raw Captions with Time (RCWT) closed caption demuxer
diff --git a/MAINTAINERS b/MAINTAINERS
index 9714581c6b..44cb0de218 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -153,6 +153,7 @@ Codecs:
   amfenc*                               Dmitrii Ovchinnikov
   aptx.c                                Aurelien Jacobs
   ass*                                  Aurelien Jacobs
+  srv3*                                 Hubert Głuchowski (CC <fishhh at fishhh.dev>)
   asv*                                  Michael Niedermayer
   atrac3plus*                           Maxim Poliakovski
   audiotoolbox*                         rcombs
@@ -376,6 +377,7 @@ Muxers/Demuxers:
   argo_brp.c                            Zane van Iperen
   argo_cvg.c                            Zane van Iperen
   ass*                                  Aurelien Jacobs
+  srv3*                                 Hubert Głuchowski (CC <fishhh at fishhh.dev>)
   astenc.c                              James Almer
   avi*                                  Michael Niedermayer
   avisynth.c                            Stephen Hutchinson
diff --git a/configure b/configure
index 0a7ce31e09..e5574c780d 100755
--- a/configure
+++ b/configure
@@ -3727,6 +3727,8 @@ wtv_demuxer_select="mpegts_demuxer riffdec"
 wtv_muxer_select="mpegts_muxer riffenc"
 xmv_demuxer_select="riffdec"
 xwma_demuxer_select="riffdec"
+srv3_demuxer_deps="libxml2"
+srv3_demuxer_select="srv3dec"
 
 # indevs / outdevs
 android_camera_indev_deps="android camera2ndk mediandk pthreads"
diff --git a/doc/general_contents.texi b/doc/general_contents.texi
index 5faf89815b..c182568061 100644
--- a/doc/general_contents.texi
+++ b/doc/general_contents.texi
@@ -1450,6 +1450,7 @@ performance on systems without hardware floating point support).
 @item RealText         @tab   @tab X @tab   @tab X
 @item SAMI             @tab   @tab X @tab   @tab X
 @item Spruce format (STL) @tab   @tab X @tab   @tab X
+@item SRV3             @tab   @tab X @tab   @tab X
 @item SSA/ASS          @tab X @tab X @tab X @tab X
 @item SubRip (SRT)     @tab X @tab X @tab X @tab X
 @item SubViewer v1     @tab   @tab X @tab   @tab X
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index a3ef11a258..1334d62d10 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -707,6 +707,7 @@ OBJS-$(CONFIG_SP5X_DECODER)            += sp5xdec.o
 OBJS-$(CONFIG_SRGC_DECODER)            += mscc.o
 OBJS-$(CONFIG_SRT_DECODER)             += srtdec.o ass.o htmlsubtitles.o
 OBJS-$(CONFIG_SRT_ENCODER)             += srtenc.o ass_split.o
+OBJS-$(CONFIG_SRV3_DECODER)            += srv3dec.o ass.o
 OBJS-$(CONFIG_STL_DECODER)             += textdec.o ass.o
 OBJS-$(CONFIG_SUBRIP_DECODER)          += srtdec.o ass.o htmlsubtitles.o
 OBJS-$(CONFIG_SUBRIP_ENCODER)          += srtenc.o ass_split.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 433a2265a3..32896ae1f6 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -739,6 +739,7 @@ extern const FFCodec ff_webvtt_encoder;
 extern const FFCodec ff_webvtt_decoder;
 extern const FFCodec ff_xsub_encoder;
 extern const FFCodec ff_xsub_decoder;
+extern const FFCodec ff_srv3_decoder;
 
 /* external libraries */
 extern const FFCodec ff_aac_at_encoder;
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index d31dc432ff..3e6cf1f998 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -3641,6 +3641,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("ARIB STD-B24 caption"),
         .profiles  = NULL_IF_CONFIG_SMALL(ff_arib_caption_profiles),
     },
+    {
+        .id        = AV_CODEC_ID_SRV3,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "srv3",
+        .long_name = NULL_IF_CONFIG_SMALL("SRV3 subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
 
     /* other kind of codecs and pseudo-codecs */
     {
diff --git a/libavcodec/codec_id.h b/libavcodec/codec_id.h
index 0731d6cd69..df76d3c913 100644
--- a/libavcodec/codec_id.h
+++ b/libavcodec/codec_id.h
@@ -580,6 +580,7 @@ enum AVCodecID {
     AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
     AV_CODEC_ID_TTML,
     AV_CODEC_ID_ARIB_CAPTION,
+    AV_CODEC_ID_SRV3,
 
     /* other specific kind of codecs (generally used for attachments) */
     AV_CODEC_ID_FIRST_UNKNOWN = 0x18000,           ///< A dummy ID pointing at the start of various fake codecs.
diff --git a/libavcodec/packet.c b/libavcodec/packet.c
index 5104eb98b1..fa5f84e34a 100644
--- a/libavcodec/packet.c
+++ b/libavcodec/packet.c
@@ -288,6 +288,8 @@ const char *av_packet_side_data_name(enum AVPacketSideDataType type)
     case AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL:   return "Matroska BlockAdditional";
     case AV_PKT_DATA_WEBVTT_IDENTIFIER:          return "WebVTT ID";
     case AV_PKT_DATA_WEBVTT_SETTINGS:            return "WebVTT Settings";
+    case AV_PKT_DATA_SRV3_HEAD:                  return "SRV3 Head";
+    case AV_PKT_DATA_SRV3_EVENT:                 return "SRV3 Event metadata";
     case AV_PKT_DATA_METADATA_UPDATE:            return "Metadata Update";
     case AV_PKT_DATA_MPEGTS_STREAM_ID:           return "MPEGTS Stream ID";
     case AV_PKT_DATA_MASTERING_DISPLAY_METADATA: return "Mastering display metadata";
diff --git a/libavcodec/packet.h b/libavcodec/packet.h
index c1f1ad7b43..d3ccb97809 100644
--- a/libavcodec/packet.h
+++ b/libavcodec/packet.h
@@ -345,6 +345,18 @@ enum AVPacketSideDataType {
      */
     AV_PKT_DATA_LCEVC,
 
+    /**
+     * SRV3 subtitle header.
+     * Not part of public API, do not rely on its existence or layout.
+     */
+    AV_PKT_DATA_SRV3_HEAD,
+
+    /**
+     * SRV3 metadata associated with a single subtitle event.
+     * Not part of public API, do not rely on its existence or layout.
+     */
+    AV_PKT_DATA_SRV3_EVENT,
+
     /**
      * The number of side data types.
      * This is not part of the public API/ABI in the sense that it may
diff --git a/libavcodec/srv3dec.c b/libavcodec/srv3dec.c
new file mode 100644
index 0000000000..c0e7eab3f4
--- /dev/null
+++ b/libavcodec/srv3dec.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2024 Hubert Głuchowski
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SRV3/YTT subtitle decoder
+ * @see https://github.com/arcusmaximus/YTSubConverter
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "codec_internal.h"
+#include "libavformat/srv3.h"
+#include "libavutil/bprint.h"
+#include "version.h"
+
+static const int PLAY_RES_X = 1280;     /* PlayResX written to the ASS script header */
+static const int PLAY_RES_Y = 720;      /* PlayResY written to the ASS script header */
+static const int BASE_FONT_SIZE = 38;   /* ASS font size produced for an SRV3 font size of 100 */
+
+// From https://github.com/arcusmaximus/YTSubConverter/blob/38fb2ab469f37e8f3a5a6a27adf91d9d0e81ea4f/YTSubConverter.Shared/Formats/YttDocument.cs#L1123
+static const char *srv3_font_style_to_font_name(int font_style) {
+    /* Font family per SRV3 font style number; NULL slots (0 and 5) use the Roboto fallback. */
+    static const char *const font_names[] = {
+        NULL,
+        "Courier New",
+        "Times New Roman",
+        "Lucida Console",
+        "Comic Sans Ms",
+        NULL,
+        "Monotype Corsiva",
+        "Carrois Gothic Sc",
+    };
+    const int known = sizeof(font_names) / sizeof(font_names[0]);
+
+    if (font_style < 0 || font_style >= known || !font_names[font_style])
+        return "Roboto";
+    return font_names[font_style];
+}
+
+static int srv3_point_to_ass_alignment(int point) { /* SRV3 anchor point -> number emitted after the ASS \an tag */
+    if (point < 3)
+        return point + 7; /* 0..2 -> 7..9 */
+    if (point < 6)
+        return point + 1; /* 3..5 -> 4..6 */
+    return point - 5;     /* 6..8 -> 1..3 */
+}
+
+static int srv3_coord_to_ass(int coord, int max) {
+    return (2.0 + coord * 0.96) / 100.0 * max; /* coord 0 maps to 2% of max, coord 100 to 98%; the result is truncated to int */
+}
+
+static float srv3_font_size_to_ass(int size) {
+    return BASE_FONT_SIZE * (1.0 + ((size / 100.0) - 1.0) / 4.0); /* size 100 gives BASE_FONT_SIZE; the relative deviation from 100 is applied at a quarter strength */
+}
+
+#define RGB2BGR(color) (((color) & 0x0000FF) << 16 | ((color) & 0x00FF00) | ((color) & 0xFF0000) >> 16) /* 0xRRGGBB -> 0xBBGGRR */
+#define RGB2ASS(color, alpha) ((unsigned)RGB2BGR(color) | ((unsigned)(0xFF - (alpha)) << 24)) /* top byte holds 0xFF - alpha; unsigned keeps the shift into bit 31 defined and matches %x */
+#define ASSBOOL(value) (((value) > 0) * -1) /* -1 when value is positive, 0 otherwise */
+
+static void srv3_style_segment(AVCodecContext *ctx, AVBPrint *buf, SRV3Segment *segment) {
+    const SRV3Pen *pen = segment->pen;
+
+    /* Switch to the pen's style; the decoder header names the styles P<id + 1>. */
+    av_bprintf(buf, "{\\rP%i}", pen->id + 1);
+
+    /*
+     * Edge effects are only emitted for pens without a background box:
+     * ASS doesn't support text shadows or outlines with BorderStyle 3.
+     * TODO: Add an option to enable BorderStyle 4 usage
+     */
+    if (pen->background_alpha != 0)
+        return;
+
+    switch (pen->edge_type) {
+    case SRV3_EDGE_NONE:
+        break;
+    case SRV3_EDGE_HARD_SHADOW:
+    case SRV3_EDGE_BEVEL:
+        av_bprintf(buf, "{\\shad2}");
+        break;
+    /*
+     * Soft shadows fall back to the glow effect instead of a normal shadow.
+     * YTSubConverter makes a different choice here.
+     */
+    case SRV3_EDGE_SOFT_SHADOW:
+    case SRV3_EDGE_GLOW:
+        av_bprintf(buf, "{\\bord2\\blur3}");
+        break;
+    default:
+        av_log(ctx, AV_LOG_WARNING, "bug: Unhandled edge type %i in decoder\n", pen->edge_type);
+        break;
+    }
+}
+
+static void srv3_process_text(AVBPrint *buf, const char *text, int count) { /* appends count bytes of text as ASS event text */
+    for (int i = 0; i < count; ++i) {
+        if (text[i] == '\n')
+            av_bprintf(buf, "\\N");           /* ASS hard line break */
+        else if (text[i] == '{' || text[i] == '}')
+            av_bprintf(buf, "\\%c", text[i]); /* escaped so literal braces are not read as an override block */
+        else if (text[i] != '\r')             /* carriage returns are dropped */
+            av_bprint_chars(buf, text[i], 1);
+    }
+}
+
+static void srv3_position_event(SRV3EventMeta *event, int *x, int *y, int *align) {
+    const SRV3WindowPos *wp = event->wp;
+    /* Events without a window position fall back to (50, 100) with alignment 2. */
+    const int anchor_x = wp ? wp->x : 50;
+    const int anchor_y = wp ? wp->y : 100;
+
+    *x = srv3_coord_to_ass(anchor_x, PLAY_RES_X);
+    *y = srv3_coord_to_ass(anchor_y, PLAY_RES_Y);
+
+    *align = wp ? srv3_point_to_ass_alignment(wp->point) : 2;
+}
+
+static void srv3_event_text_ass(AVCodecContext *ctx, AVBPrint *buf, const char *text, SRV3EventMeta *event)
+{
+    int pos_x, pos_y, an;
+    const char *cursor = text;
+
+    /* One positioning block for the whole event, then each segment's style tags followed by its text. */
+    srv3_position_event(event, &pos_x, &pos_y, &an);
+    av_bprintf(buf, "{\\an%i\\pos(%i,%i)}", an, pos_x, pos_y);
+
+    for (SRV3Segment *seg = event->segments; seg; cursor += seg->size, seg = seg->next) {
+        srv3_style_segment(ctx, buf, seg);
+        srv3_process_text(buf, cursor, seg->size);
+    }
+}
+
+static int srv3_decode_frame(AVCodecContext *avctx, AVSubtitle *sub,
+                             int *got_sub_ptr, const AVPacket *avpkt)
+{
+    int ret = 0;
+    FFASSDecoderContext *ctx = avctx->priv_data;
+    const char *text = avpkt->data;
+    SRV3EventMeta *event = (SRV3EventMeta*)av_packet_get_side_data(avpkt, AV_PKT_DATA_SRV3_EVENT, NULL);
+    AVBPrint buf;
+
+    if (!text || avpkt->size == 0)
+        return 0;
+    if (!event) /* the segment list and window position live in the event side data */
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    srv3_event_text_ass(avctx, &buf, text, event);
+    if (av_bprint_is_complete(&buf))
+        ret = ff_ass_add_rect(sub, buf.str, ctx->readorder++, 0, NULL, NULL);
+    else
+        ret = AVERROR(ENOMEM);
+    av_bprint_finalize(&buf, NULL);
+
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size; /* the whole packet is reported as consumed */
+}
+
+static av_cold int srv3_decoder_init(AVCodecContext *avctx) {
+    int ret;
+    AVBPrint header;
+    const AVPacketSideData *head_sd;
+    SRV3Pen *pen;
+
+    av_bprint_init(&header, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    av_bprintf(&header,
+               "[Script Info]\r\n"
+               "; Script generated by FFmpeg/Lavc%s\r\n"
+               "ScriptType: v4.00+\r\n"
+               "PlayResX: %i\r\n"
+               "PlayResY: %i\r\n"
+               "WrapStyle: 0\r\n"
+               "ScaledBorderAndShadow: yes\r\n"
+               "YCbCr Matrix: None\r\n"
+               "\r\n"
+               "[V4+ Styles]\r\n"
+               "Format: Name, "
+               "Fontname, Fontsize, "
+               "PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
+               "Bold, Italic, Underline, StrikeOut, "
+               "ScaleX, ScaleY, "
+               "Spacing, Angle, "
+               "BorderStyle, Outline, Shadow, "
+               "Alignment, MarginL, MarginR, MarginV, "
+               "Encoding\r\n",
+               !(avctx->flags & AV_CODEC_FLAG_BITEXACT) ? AV_STRINGIFY(LIBAVCODEC_VERSION) : "",
+               PLAY_RES_X, PLAY_RES_Y);
+
+    head_sd = av_packet_side_data_get(avctx->coded_side_data, avctx->nb_coded_side_data, AV_PKT_DATA_SRV3_HEAD);
+    if (head_sd) {
+        for (pen = ((SRV3Head*)head_sd->data)->pens; pen; pen = pen->next) /* one ASS style per pen */
+            av_bprintf(&header,
+                       "Style: "
+                       "P%i,"                 /* Name */
+                       "%s,%f,"               /* Font{name,size} */
+                       "&H%x,&H0,&H%x,&H%x,"  /* {Primary,Secondary,Outline,Back}Colour */
+                       "%i,%i,0,0,"           /* Bold, Italic, Underline, StrikeOut */
+                       "100,100,"             /* Scale{X,Y} */
+                       "0,0,"                 /* Spacing, Angle */
+                       "%i,%i,0,"             /* BorderStyle, Outline, Shadow */
+                       "2,0,0,0,"             /* Alignment, Margin[LRV] */
+                       "1\r\n",               /* Encoding */
+                       pen->id + 1,
+                       srv3_font_style_to_font_name(pen->font_style), srv3_font_size_to_ass(pen->font_size),
+                       RGB2ASS(pen->foreground_color, pen->foreground_alpha),
+                       pen->background_alpha > 0
+                           ? RGB2ASS(pen->background_color, pen->background_alpha)
+                           : RGB2ASS(pen->edge_color, pen->foreground_alpha),
+                       pen->background_alpha > 0
+                           ? RGB2ASS(pen->background_color, pen->background_alpha)
+                           : RGB2ASS(pen->edge_color, pen->foreground_alpha),
+                       ASSBOOL(pen->attrs & SRV3_PEN_ATTR_BOLD), ASSBOOL(pen->attrs & SRV3_PEN_ATTR_ITALIC),
+                       pen->background_alpha > 0 ? 3 : (pen->edge_type > 0), pen->background_alpha > 0);
+    }
+
+    av_bprintf(&header,
+               "[Events]\r\n"
+               "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\r\n");
+
+    /* An allocation failure leaves the buffer truncated; never publish a partial header. */
+    if (!av_bprint_is_complete(&header)) {
+        av_bprint_finalize(&header, NULL);
+        return AVERROR(ENOMEM);
+    }
+    /* On success the finalized string is handed over to avctx. */
+    ret = av_bprint_finalize(&header, (char**)&avctx->subtitle_header);
+    if (ret >= 0)
+        avctx->subtitle_header_size = header.len;
+    return ret;
+}
+
+const FFCodec ff_srv3_decoder = {
+    .p.name         = "srv3",
+    CODEC_LONG_NAME("SRV3 subtitle"),
+    .p.type         = AVMEDIA_TYPE_SUBTITLE,
+    .p.id           = AV_CODEC_ID_SRV3,
+    FF_CODEC_DECODE_SUB_CB(srv3_decode_frame),
+    .init           = srv3_decoder_init, /* builds the ASS script header */
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext), /* srv3_decode_frame uses its readorder counter */
+};
diff --git a/libavformat/Makefile b/libavformat/Makefile
index 074efc118a..6a9744d571 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -571,6 +571,7 @@ OBJS-$(CONFIG_SPEEX_MUXER)               += oggenc.o \
                                             vorbiscomment.o
 OBJS-$(CONFIG_SRT_DEMUXER)               += srtdec.o subtitles.o
 OBJS-$(CONFIG_SRT_MUXER)                 += srtenc.o
+OBJS-$(CONFIG_SRV3_DEMUXER)              += srv3dec.o subtitles.o
 OBJS-$(CONFIG_STL_DEMUXER)               += stldec.o subtitles.o
 OBJS-$(CONFIG_STR_DEMUXER)               += psxstr.o
 OBJS-$(CONFIG_STREAMHASH_MUXER)          += hashenc.o
diff --git a/libavformat/allformats.c b/libavformat/allformats.c
index 445f13f42a..f56eb34a90 100644
--- a/libavformat/allformats.c
+++ b/libavformat/allformats.c
@@ -451,6 +451,7 @@ extern const FFInputFormat  ff_spdif_demuxer;
 extern const FFOutputFormat ff_spdif_muxer;
 extern const FFInputFormat  ff_srt_demuxer;
 extern const FFOutputFormat ff_srt_muxer;
+extern const FFInputFormat  ff_srv3_demuxer;
 extern const FFInputFormat  ff_str_demuxer;
 extern const FFInputFormat  ff_stl_demuxer;
 extern const FFOutputFormat ff_streamhash_muxer;
diff --git a/libavformat/srv3.h b/libavformat/srv3.h
new file mode 100644
index 0000000000..45bf997654
--- /dev/null
+++ b/libavformat/srv3.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2024 Hubert Głuchowski
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFORMAT_SRV3_H
+#define AVFORMAT_SRV3_H
+
+#include "avformat.h"
+#include "internal.h"
+
+enum SRV3PenAttrs { /* bit flags OR-ed into SRV3Pen.attrs */
+    SRV3_PEN_ATTR_ITALIC = 1, /* set by the demuxer when the pen attribute "i" is "1" */
+    SRV3_PEN_ATTR_BOLD = 2, /* set by the demuxer when the pen attribute "b" is "1" */
+};
+
+// https://github.com/arcusmaximus/YTSubConverter/blob/38fb2ab469f37e8f3a5a6a27adf91d9d0e81ea4f/YTSubConverter.Shared/Formats/YttDocument.cs#L1019C14-L1019C14
+enum SRV3EdgeType { /* values of SRV3Pen.edge_type, read from the pen attribute "et" */
+    SRV3_EDGE_NONE = 0, /* no edge tags are emitted */
+    SRV3_EDGE_HARD_SHADOW = 1, /* decoder emits \shad2 */
+    SRV3_EDGE_BEVEL = 2, /* decoder emits \shad2 */
+    SRV3_EDGE_GLOW = 3, /* decoder emits \bord2\blur3 */
+    SRV3_EDGE_SOFT_SHADOW = 4, /* decoder emits \bord2\blur3, same as glow */
+};
+
+enum SRV3RubyPart { /* values of SRV3Pen.ruby_part, read from the pen attribute "rb" */
+    SRV3_RUBY_NONE = 0, /* also what the demuxer stores when it meets the unused value 3 */
+    SRV3_RUBY_BASE = 1,
+    SRV3_RUBY_PARENTHESIS = 2,
+    SRV3_RUBY_BEFORE = 4,
+    SRV3_RUBY_AFTER = 5,
+};
+
+typedef struct SRV3Pen {
+    int id; /* pen attribute "id"; -1 in the demuxer's built-in default pen */
+
+    int font_size, font_style; /* attributes "sz" and "fs"; the decoder maps them to an ASS font size and font name */
+    int attrs; /* bitmask of enum SRV3PenAttrs */
+
+    int edge_type, edge_color; /* attributes "et" (enum SRV3EdgeType) and "ec" (hex colour, at most 0xFFFFFF) */
+
+    int ruby_part; /* attribute "rb" (enum SRV3RubyPart) */
+
+    int foreground_color, foreground_alpha; /* attributes "fc" (hex colour) and "fo" (0..255) */
+    int background_color, background_alpha; /* attributes "bc" (hex colour) and "bo" (0..255) */
+
+    struct SRV3Pen *next; /* next pen in the singly linked list */
+} SRV3Pen;
+
+typedef struct SRV3WindowPos {
+    int id; /* attribute "id"; events select a window position by this value */
+
+    int point, x, y; /* attributes "ap" (0..8), "ah" (0..100) and "av" (0..100) */
+
+    struct SRV3WindowPos *next; /* next window position in the singly linked list */
+} SRV3WindowPos;
+
+typedef struct SRV3Head {
+    SRV3Pen *pens; /* head of the pen list; the decoder emits one ASS style per entry */
+} SRV3Head;
+
+typedef struct SRV3Segment {
+    int size; /* number of bytes of the event text covered by this segment */
+    SRV3Pen *pen; /* pen whose style the decoder selects before this segment's text */
+
+    /*
+     * The next segment in the same event.
+     */
+    struct SRV3Segment *next;
+} SRV3Segment;
+
+typedef struct SRV3EventMeta {
+    /*
+    * An ordered list of segments.
+    */
+    SRV3Segment *segments; /* the decoder walks it front to back, consuming size bytes of text per segment */
+    SRV3WindowPos *wp; /* may be NULL; the decoder then uses a default position and alignment */
+} SRV3EventMeta;
+
+#endif // AVFORMAT_SRV3_H
diff --git a/libavformat/srv3dec.c b/libavformat/srv3dec.c
new file mode 100644
index 0000000000..59315c0308
--- /dev/null
+++ b/libavformat/srv3dec.c
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2024 Hubert Głuchowski
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SRV3/YTT subtitle demuxer
+ * This is a youtube specific subtitle format that utilizes XML.
+ * Because there is currently no official documentation of the format,
+ * some information was acquired by reading the YTSubConverter code.
+ * @see https://github.com/arcusmaximus/YTSubConverter
+ */
+
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+#include "srv3.h"
+#include "avformat.h"
+#include "demux.h"
+#include "internal.h"
+#include "subtitles.h"
+#include "libavutil/bprint.h"
+#include "libavutil/opt.h"
+#include "libavutil/mem.h"
+
+typedef struct SRV3GlobalSegments { /* node of a list of segment chains; srv3_free_context_data frees each chain and node */
+    SRV3Segment *list; /* head of one chain of segments */
+    struct SRV3GlobalSegments *next; /* next chain holder */
+} SRV3GlobalSegments;
+
+typedef struct SRV3Context { /* demuxer state */
+    const AVClass *class;
+    FFDemuxSubtitlesQueue q;
+    SRV3Pen *pens; /* pen list; srv3_read_pen prepends, so the newest pen comes first */
+    SRV3WindowPos *wps; /* window position list; srv3_read_window_pos prepends as well */
+    SRV3GlobalSegments *segments; /* every segment chain, released in srv3_free_context_data */
+} SRV3Context;
+
+static SRV3Pen srv3_default_pen = { /* template copied into every new pen and used for events that name no pen */
+    .id = -1, /* parsed pen ids are never negative, so this cannot match one */
+
+    .font_size = 100,
+    .font_style = 0,
+    .attrs = 0,
+
+    .edge_type = 0,
+    .edge_color = 0x020202,
+
+    .ruby_part = SRV3_RUBY_NONE,
+
+    .foreground_color = 0xFFFFFF,
+    .foreground_alpha = 254,
+    .background_color = 0x080808,
+    .background_alpha = 192,
+
+    .next = NULL
+};
+
+static void srv3_free_context_data(SRV3Context *ctx) {
+    void *next; /* successor saved before each av_free(); written by FREE_LIST and by the loop below */
+
+#define FREE_LIST(type, list, until)                     \
+do {                                                                \
+    for (void *current = list; current && current != until; current = next) {  \
+        next = ((type*)current)->next;                              \
+        av_free(current);                                           \
+    }                                                               \
+} while(0)
+
+    FREE_LIST(SRV3Pen, ctx->pens, &srv3_default_pen); /* stops at the static default pen, which must not be freed */
+    FREE_LIST(SRV3WindowPos, ctx->wps, NULL);
+
+    for (SRV3GlobalSegments *segments = ctx->segments; segments; segments = next) { /* each node owns one segment chain */
+        FREE_LIST(SRV3Segment, segments->list, NULL);
+        next = segments->next; /* reloaded here because FREE_LIST above overwrote next */
+        av_free(segments);
+    }
+}
+
+static SRV3Pen *srv3_get_pen(SRV3Context *ctx, int id) {
+    SRV3Pen *pen = ctx->pens;
+    while (pen && pen->id != id) /* linear search; ends with NULL when no pen has this id */
+        pen = pen->next;
+    return pen;
+}
+
+static int srv3_probe(const AVProbeData *p)
+{
+    /* The format is recognised by its root element tag appearing anywhere in the probe buffer. */
+    const char *tag = strstr(p->buf, "<timedtext format=\"3\">");
+
+    return tag ? AVPROBE_SCORE_MAX : 0;
+}
+
+static int srv3_parse_numeric_value(SRV3Context *ctx, const char *parent, const char *name, const char *value, int base, int *out, int min, int max)
+{
+    char *endptr;
+    long parsed;
+
+    errno = 0; /* cleared so that ERANGE below can only come from this strtol call */
+    parsed = strtol(value, &endptr, base);
+    if (endptr == value || *endptr != 0) { /* no digits at all, or trailing characters */
+        av_log(ctx, AV_LOG_WARNING, "Failed to parse value \"%s\" of %s attribute %s as an integer\n", value, parent, name);
+        return AVERROR_INVALIDDATA;
+    } else if (errno == ERANGE || parsed < min || parsed > max) { /* also catches saturation where long is no wider than int */
+        av_log(ctx, AV_LOG_WARNING, "Value %li out of range for %s attribute %s ([%i, %i])\n", parsed, parent, name, min, max);
+        return AVERROR(ERANGE);
+    } else if(out) {
+        *out = parsed;
+        return 0; /* value stored in *out */
+    } else return parsed; /* no output pointer: the parsed value itself is returned */
+}
+
+static int srv3_parse_numeric_attr(SRV3Context *ctx, const char *parent, xmlAttrPtr attr, int *out, int min, int max)
+{ /* returns 1 on a successful parse, 0 otherwise; an attribute without a child text node (empty value) counts as a failure */
+    return attr->children && srv3_parse_numeric_value(ctx, parent, attr->name, attr->children->content, 10, out, min, max) == 0;
+}
+
+static void srv3_parse_color_attr(SRV3Context *ctx, const char *parent, xmlAttrPtr attr, int *out) {
+    if (attr->children) /* an empty attribute has no text node; *out then keeps its previous value. A leading '#' is skipped. */
+        srv3_parse_numeric_value(ctx, parent, attr->name, attr->children->content + (*attr->children->content == '#'), 16, out, 0, 0xFFFFFF);
+}
+
+static int srv3_read_pen(SRV3Context *ctx, xmlNodePtr element)
+{
+    SRV3Pen *pen = av_malloc(sizeof(SRV3Pen));
+    if (!pen)
+        return AVERROR(ENOMEM);
+    *pen = srv3_default_pen; /* start from the defaults; attributes below override them */
+    pen->next = ctx->pens; /* prepend, so the context owns the pen from here on */
+    ctx->pens = pen;
+
+    for (xmlAttrPtr attr = element->properties; attr; attr = attr->next) { /* a value that fails to parse keeps its default */
+        if (!strcmp(attr->name, "id"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->id, 0, INT_MAX);
+        else if (!strcmp(attr->name, "sz"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->font_size, 0, INT_MAX);
+        else if (!strcmp(attr->name, "fs"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->font_style, 1, 7);
+        else if (!strcmp(attr->name, "et"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->edge_type, 1, 4);
+        else if (!strcmp(attr->name, "ec"))
+            srv3_parse_color_attr(ctx, "pen", attr, &pen->edge_color);
+        else if (!strcmp(attr->name, "fc"))
+            srv3_parse_color_attr(ctx, "pen", attr, &pen->foreground_color);
+        else if (!strcmp(attr->name, "fo"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->foreground_alpha, 0, 0xFF);
+        else if (!strcmp(attr->name, "bc"))
+            srv3_parse_color_attr(ctx, "pen", attr, &pen->background_color);
+        else if (!strcmp(attr->name, "bo"))
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->background_alpha, 0, 0xFF);
+        else if (!strcmp(attr->name, "rb")) {
+            srv3_parse_numeric_attr(ctx, "pen", attr, &pen->ruby_part, 0, 5);
+            /*
+            * For whatever reason three seems to be an unused value for this enum.
+            */
+            if (pen->ruby_part == 3) {
+                pen->ruby_part = 0;
+                av_log(ctx, AV_LOG_WARNING, "Encountered unknown ruby part 3\n");
+            }
+        } else if (!strcmp(attr->name, "i")) /* an empty attribute has no child text node and leaves the flag clear */
+            pen->attrs |= (attr->children && !strcmp(attr->children->content, "1")) * SRV3_PEN_ATTR_ITALIC;
+        else if (!strcmp(attr->name, "b"))
+            pen->attrs |= (attr->children && !strcmp(attr->children->content, "1")) * SRV3_PEN_ATTR_BOLD;
+        else {
+            av_log(ctx, AV_LOG_WARNING, "Unhandled pen property %s\n", attr->name);
+            continue;
+        }
+    }
+
+    return 0;
+}
+
+static int srv3_read_window_pos(SRV3Context *ctx, xmlNodePtr element)
+{
+    SRV3WindowPos *wp = av_mallocz(sizeof(*wp)); /* zeroed: attributes that are missing or fail to parse stay 0 */
+    if (!wp)
+        return AVERROR(ENOMEM);
+    wp->next = ctx->wps; /* prepend, so the context owns the entry from here on */
+    ctx->wps = wp;
+
+    for (xmlAttrPtr attr = element->properties; attr; attr = attr->next) {
+        if (!strcmp(attr->name, "id"))
+            srv3_parse_numeric_attr(ctx, "window pos", attr, &wp->id, 0, INT_MAX);
+        else if (!strcmp(attr->name, "ap"))
+            srv3_parse_numeric_attr(ctx, "window pos", attr, &wp->point, 0, 8);
+        else if (!strcmp(attr->name, "ah"))
+            srv3_parse_numeric_attr(ctx, "window pos", attr, &wp->x, 0, 100);
+        else if (!strcmp(attr->name, "av"))
+            srv3_parse_numeric_attr(ctx, "window pos", attr, &wp->y, 0, 100);
+        else {
+            av_log(ctx, AV_LOG_WARNING, "Unhandled window pos property %s\n", attr->name);
+            continue;
+        }
+    }
+
+    return 0;
+}
+
+static int srv3_read_pens(SRV3Context *ctx, xmlNodePtr head)
+{
+    /* Despite the name, both "pen" and "wp" children of head are read; other children are ignored. */
+    for (xmlNodePtr child = head->children; child; child = child->next) {
+        int err = 0;
+
+        if (!strcmp(child->name, "pen"))
+            err = srv3_read_pen(ctx, child);
+        else if (!strcmp(child->name, "wp"))
+            err = srv3_read_window_pos(ctx, child);
+
+        if (err < 0)
+            return err;
+    }
+    return 0;
+}
+
+#define ZERO_WIDTH_SPACE "\xE2\x80\x8B" /* U+200B spelled as UTF-8 bytes, so it does not depend on the execution character set */
+#define YTSUBCONV_PADDING_SPACE ZERO_WIDTH_SPACE " " ZERO_WIDTH_SPACE
+
+/* Remove zero-width spaces, and whole padding sequences, from text in place; returns the new length. */
+static int srv3_clean_segment_text(char *text) {
+    char *dst = text;
+    const char *src = text;
+    const char *zwsp;
+
+    while ((zwsp = strstr(src, ZERO_WIDTH_SPACE))) {
+        size_t keep = zwsp - src;
+
+        memmove(dst, src, keep);
+        dst += keep;
+        /* Skip a full padding sequence when present, otherwise only the lone zero-width space. */
+        if (!av_strstart(zwsp, YTSUBCONV_PADDING_SPACE, &src))
+            src = zwsp + strlen(ZERO_WIDTH_SPACE);
+    }
+
+    /* Move the tail after the last marker, including its terminator. */
+    memmove(dst, src, strlen(src) + 1);
+    return dst + strlen(dst) - text;
+}
+
+/**
+ * Parse the <body> element: every <p> child is turned into one subtitle
+ * packet inserted into ctx->q.
+ *
+ * The cleaned text of the event's children (bare text nodes and <s>
+ * elements) is concatenated into the packet payload. For each non-empty
+ * child an SRV3Segment recording its byte length and pen is appended to the
+ * event's segment list. The event metadata is attached to the packet as
+ * AV_PKT_DATA_SRV3_EVENT side data and the segment list is additionally
+ * linked into ctx->segments.
+ *
+ * @param ctx  demuxer context (pens, window positions, queue, segment lists)
+ * @param body the <body> XML element
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int srv3_read_body(SRV3Context *ctx, xmlNodePtr body)
+{
+    int ret = 0;
+    AVBPrint textbuf;
+    char *text;
+    AVPacket *sub;
+    SRV3WindowPos *wp;
+    /* Non-NULL only while the current event is still owned by this function,
+     * i.e. until it has been handed over to the packet as side data. */
+    SRV3EventMeta *event = NULL;
+    /* Set once event->segments has been linked into ctx->segments. */
+    int segments_registered = 0;
+
+    av_bprint_init(&textbuf, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (xmlNodePtr element = body->children; element; element = element->next) {
+        if (!strcmp(element->name, "p")) {
+            SRV3Segment **segments_tail_next;
+            SRV3GlobalSegments *global_segments;
+            int textlen, lastlen = 0;
+            /* Reset per event so that a missing "t"/"d" attribute never reads
+             * an indeterminate value or one left over from an earlier event. */
+            int start = 0, duration = 0;
+            SRV3Pen *event_pen = &srv3_default_pen;
+
+            event = av_mallocz(sizeof(*event));
+            if (!event) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            segments_registered = 0;
+
+            segments_tail_next = &event->segments;
+
+            for (xmlAttrPtr attr = element->properties; attr; attr = attr->next) {
+                if (!strcmp(attr->name, "t"))
+                    srv3_parse_numeric_attr(ctx, "event", attr, &start, 0, INT_MAX);
+                else if (!strcmp(attr->name, "d"))
+                    srv3_parse_numeric_attr(ctx, "event", attr, &duration, 0, INT_MAX);
+                else if (!strcmp(attr->name, "wp")) {
+                    int id;
+                    /* Only look the id up when parsing succeeded; otherwise id
+                     * would be used uninitialized. (The other call sites treat
+                     * a non-zero return as success.) */
+                    if (srv3_parse_numeric_attr(ctx, "event", attr, &id, 0, INT_MAX)) {
+                        for (wp = ctx->wps; wp; wp = wp->next)
+                            if (wp->id == id) {
+                                event->wp = wp;
+                                break;
+                            }
+                        if (!event->wp)
+                            av_log(ctx, AV_LOG_WARNING, "Non-existent window pos %i assigned to event\n", id);
+                    }
+                } else if (!strcmp(attr->name, "p")) {
+                    int id;
+                    if (srv3_parse_numeric_attr(ctx, "event", attr, &id, 0, INT_MAX)) {
+                        SRV3Pen *pen = srv3_get_pen(ctx, id);
+                        if (pen)
+                            event_pen = pen;
+                        else
+                            av_log(ctx, AV_LOG_WARNING, "Non-existent pen %i assigned to event\n", id);
+                    }
+                } else if (!strcmp(attr->name, "ws")) {
+                    // TODO: Handle window styles
+                } else {
+                    av_log(ctx, AV_LOG_WARNING, "Unhandled event property %s\n", attr->name);
+                }
+            }
+
+            for (xmlNodePtr node = element->children; node; node = node->next) {
+                SRV3Segment *segment;
+
+                if (node->type != XML_ELEMENT_NODE && node->type != XML_TEXT_NODE) {
+                    av_log(ctx, AV_LOG_WARNING, "Unexpected event child node type %i\n", node->type);
+                    continue;
+                } else if (node->type == XML_ELEMENT_NODE && strcmp(node->name, "s")) {
+                    av_log(ctx, AV_LOG_WARNING, "Unknown event child node name %s\n", node->name);
+                    continue;
+                } else if (node->type == XML_ELEMENT_NODE && !node->children)
+                    continue;
+
+                text = node->type == XML_ELEMENT_NODE ? node->children->content : node->content;
+                /* The first child of an <s> element need not be a text node
+                 * (e.g. a nested element), in which case there is no content. */
+                if (!text)
+                    continue;
+
+                /* Cleans the text in place and yields its new length. */
+                textlen = srv3_clean_segment_text(text);
+
+                if (textlen == 0)
+                    continue;
+
+                segment = av_mallocz(sizeof(*segment));
+                if (!segment) {
+                    ret = AVERROR(ENOMEM);
+                    goto end;
+                }
+
+                /* A segment uses the event pen unless it names its own. */
+                segment->pen = event_pen;
+
+                if (node->type == XML_ELEMENT_NODE)
+                    for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) {
+                        if (!strcmp(attr->name, "p")) {
+                            int id;
+                            if (srv3_parse_numeric_attr(ctx, "segment", attr, &id, 0, INT_MAX)) {
+                                SRV3Pen *pen = srv3_get_pen(ctx, id);
+                                if (pen)
+                                    segment->pen = pen;
+                                else
+                                    av_log(ctx, AV_LOG_WARNING, "Non-existent pen %i assigned to segment\n", id);
+                            }
+                        } else {
+                            av_log(ctx, AV_LOG_WARNING, "Unhandled segment property %s\n", attr->name);
+                        }
+                    }
+
+                av_bprint_append_data(&textbuf, text, textlen);
+
+                /* Segment size is the number of bytes it added to the payload. */
+                segment->size = textbuf.len - lastlen;
+                lastlen = textbuf.len;
+                *segments_tail_next = segment;
+                segments_tail_next = &segment->next;
+            }
+
+            if (!av_bprint_is_complete(&textbuf)) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+
+            global_segments = av_mallocz(sizeof(*global_segments));
+            if (!global_segments) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            global_segments->list = event->segments;
+            global_segments->next = ctx->segments;
+            ctx->segments = global_segments;
+            /* From here on the segment list is reachable through ctx. */
+            segments_registered = 1;
+
+            sub = ff_subtitles_queue_insert(&ctx->q, textbuf.str, textbuf.len, 0);
+            if (!sub) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            sub->pts = start;
+            sub->duration = duration;
+
+            if ((ret = av_packet_add_side_data(sub, AV_PKT_DATA_SRV3_EVENT, (uint8_t*)event, sizeof(SRV3EventMeta))) < 0)
+                goto end;
+            /* The packet owns the event metadata now. */
+            event = NULL;
+
+            av_bprint_clear(&textbuf);
+        }
+    }
+
+end:
+    /* Release whatever the failed event still owns: the metadata itself and,
+     * if it was never linked into ctx->segments, its segment nodes. */
+    if (event) {
+        if (!segments_registered) {
+            SRV3Segment *seg = event->segments;
+            while (seg) {
+                SRV3Segment *next = seg->next;
+                av_free(seg);
+                seg = next;
+            }
+        }
+        av_free(event);
+    }
+    av_bprint_finalize(&textbuf, NULL);
+    return ret;
+}
+
+/**
+ * Demuxer read_header callback.
+ *
+ * Creates the single subtitle stream, reads the whole input into memory,
+ * parses it with libxml2, then processes every <head> element (pens) followed
+ * by every <body> element (events). The resulting pen list is exposed through
+ * the stream's AV_PKT_DATA_SRV3_HEAD coded side data.
+ *
+ * @param s format context; s->priv_data is the SRV3Context
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int srv3_read_header(AVFormatContext *s)
+{
+    int ret = 0;
+    SRV3Context *ctx = s->priv_data;
+    AVPacketSideData *head_sd;
+    SRV3Head *head;
+    AVBPrint content;
+    xmlDocPtr document = NULL;
+    xmlNodePtr root_element;
+    AVStream *st;
+
+    av_bprint_init(&content, 0, INT_MAX);
+
+    st = avformat_new_stream(s, NULL);
+    if (!st) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+    /* Timestamps are expressed in units of 1/1000 (time base 1/1000). */
+    avpriv_set_pts_info(st, 64, 1, 1000);
+    st->codecpar->codec_type = AVMEDIA_TYPE_SUBTITLE;
+    st->codecpar->codec_id   = AV_CODEC_ID_SRV3;
+    st->disposition = AV_DISPOSITION_CAPTIONS;
+
+    if (!(head_sd = av_packet_side_data_new(&st->codecpar->coded_side_data, &st->codecpar->nb_coded_side_data, AV_PKT_DATA_SRV3_HEAD, sizeof(SRV3Head), 0))) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+    head = (SRV3Head*)head_sd->data;
+
+    if ((ret = avio_read_to_bprint(s->pb, &content, SIZE_MAX)) < 0)
+        goto end;
+    /* Reject input that was not read to EOF or did not fit into the buffer. */
+    if (!avio_feof(s->pb) || !av_bprint_is_complete(&content)) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
+    LIBXML_TEST_VERSION;
+
+    document = xmlReadMemory(content.str, content.len, s->url, NULL, 0);
+
+    if (!document) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
+    root_element = xmlDocGetRootElement(document);
+    /* A document without a root element has nothing to iterate over. */
+    if (!root_element) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
+    for (xmlAttrPtr attr = root_element->properties; attr; attr = attr->next) {
+        if (!strcmp(attr->name, "format")) {
+            /* The attribute may have no value node at all; never dereference
+             * attr->children unconditionally when logging. */
+            const char *version = attr->children ? (const char *)attr->children->content : NULL;
+            if (!version || strcmp(version, "3"))
+                av_log(s, AV_LOG_WARNING, "Unrecognized timedtext format version: %s\nParsing will still be attempted but may produce unexpected results\n", version ? version : "(none)");
+        }
+    }
+
+    /* Start the pen list with the default pen before reading <head>. */
+    ctx->pens = &srv3_default_pen;
+
+    /* First pass: pens, so that events in <body> can refer to them. */
+    for (xmlNodePtr element = root_element->children; element; element = element->next) {
+        if (!strcmp(element->name, "head"))
+            if ((ret = srv3_read_pens(ctx, element)) < 0)
+                goto end;
+    }
+
+    /* Second pass: events. */
+    for (xmlNodePtr element = root_element->children; element; element = element->next) {
+        if (!strcmp(element->name, "body"))
+            if ((ret = srv3_read_body(ctx, element)) < 0)
+                goto end;
+    }
+
+    head->pens = ctx->pens;
+    ff_subtitles_queue_finalize(s, &ctx->q);
+
+end:
+    xmlFreeDoc(document);
+    av_bprint_finalize(&content, NULL);
+    return ret;
+}
+
+/**
+ * Demuxer read_packet callback: forwards to
+ * ff_subtitles_queue_read_packet() on the context's subtitle queue.
+ */
+static int srv3_read_packet(AVFormatContext *s, AVPacket *pkt)
+{
+    SRV3Context *const srv3 = s->priv_data;
+    const int status = ff_subtitles_queue_read_packet(&srv3->q, pkt);
+
+    return status;
+}
+
+/**
+ * Demuxer read_seek2 callback: forwards all seek parameters unchanged to
+ * ff_subtitles_queue_seek() on the context's subtitle queue.
+ */
+static int srv3_read_seek(AVFormatContext *s, int stream_index,
+                            int64_t min_ts, int64_t ts, int64_t max_ts, int flags)
+{
+    SRV3Context *const srv3 = s->priv_data;
+    const int status = ff_subtitles_queue_seek(&srv3->q, s, stream_index,
+                                               min_ts, ts, max_ts, flags);
+
+    return status;
+}
+
+/**
+ * Demuxer read_close callback: cleans the subtitle queue first, then frees
+ * the remaining context data. Always returns 0.
+ */
+static av_cold int srv3_read_close(AVFormatContext *s)
+{
+    SRV3Context *const srv3 = s->priv_data;
+
+    ff_subtitles_queue_clean(&srv3->q);
+    srv3_free_context_data(srv3);
+
+    return 0;
+}
+
+/* Option table for the demuxer's AVClass: contains only the terminating
+ * NULL entry, i.e. no options are defined. */
+static const AVOption options[] = {
+    { NULL }
+};
+
+/* AVClass for the demuxer's private context, referenced from
+ * ff_srv3_demuxer.p.priv_class; it points at the (empty) option table. */
+static const AVClass srv3_demuxer_class = {
+    .class_name  = "SRV3 demuxer",
+    .option      = options,
+    .version     = LIBAVUTIL_VERSION_INT,
+};
+
+/* Demuxer definition: registers the "srv3" format name and file extension
+ * and wires up the probe/header/packet/seek/close callbacks of this file.
+ * The private data is an SRV3Context. */
+const FFInputFormat ff_srv3_demuxer = {
+    .p.name         = "srv3",
+    .p.long_name    = NULL_IF_CONFIG_SMALL("SRV3 subtitle"),
+    .p.extensions   = "srv3",
+    .p.priv_class   = &srv3_demuxer_class,
+    .priv_data_size = sizeof(SRV3Context),
+    /* NOTE(review): presumably makes the framework call read_close when
+     * read_header fails - confirm against libavformat/demux.h. */
+    .flags_internal = FF_INFMT_FLAG_INIT_CLEANUP,
+    .read_probe     = srv3_probe,
+    .read_header    = srv3_read_header,
+    .read_packet    = srv3_read_packet,
+    .read_seek2     = srv3_read_seek,
+    .read_close     = srv3_read_close,
+};
diff --git a/libavformat/version.h b/libavformat/version.h
index cf0489562e..979952183c 100644
--- a/libavformat/version.h
+++ b/libavformat/version.h
@@ -31,8 +31,8 @@
 
 #include "version_major.h"
 
-#define LIBAVFORMAT_VERSION_MINOR   9
-#define LIBAVFORMAT_VERSION_MICRO 101
+#define LIBAVFORMAT_VERSION_MINOR  10
+#define LIBAVFORMAT_VERSION_MICRO 100
 
 #define LIBAVFORMAT_VERSION_INT AV_VERSION_INT(LIBAVFORMAT_VERSION_MAJOR, \
                                                LIBAVFORMAT_VERSION_MINOR, \
-- 
2.47.0