[FFmpeg-devel] [PATCH] avformat/rtcenc: Add WHIP muxer support for subsecond latency streaming

Kieran Kunhya kierank at obe.tv
Tue May 30 11:44:06 EEST 2023


On Mon, 29 May 2023 at 12:51, Steven Liu <lq at chinaffmpeg.org> wrote:

> Co-authored-by: winlin <winlinvip at gmail.com>
> Co-authored-by: yangrtc <yangrtc at aliyun.com>
> Co-authored-by: cloudwebrtc <duanweiwei1982 at gmail.com>
> Co-authored-by: Haibo Chen <495810242 at qq.com>
> Signed-off-by: Steven Liu <lq at chinaffmpeg.org>
> ---
>  configure                |    1 +
>  doc/muxers.texi          |   50 +
>  libavformat/Makefile     |    1 +
>  libavformat/allformats.c |    1 +
>  libavformat/http.c       |    6 +
>  libavformat/http.h       |    2 +
>  libavformat/rtcenc.c     | 2208 ++++++++++++++++++++++++++++++++++++++
>  7 files changed, 2269 insertions(+)
>  create mode 100644 libavformat/rtcenc.c
>
> diff --git a/configure b/configure
> index 495493aa0e..526ddc0bd6 100755
> --- a/configure
> +++ b/configure
> @@ -3483,6 +3483,7 @@ ogg_demuxer_select="dirac_parse"
>  ogv_muxer_select="ogg_muxer"
>  opus_muxer_select="ogg_muxer"
>  psp_muxer_select="mov_muxer"
> +rtc_muxer_deps_any="openssl"
>  rtp_demuxer_select="sdp_demuxer"
>  rtp_mpegts_muxer_select="mpegts_muxer rtp_muxer"
>  rtpdec_select="asf_demuxer mov_demuxer mpegts_demuxer rm_demuxer
> rtp_protocol srtp"
> diff --git a/doc/muxers.texi b/doc/muxers.texi
> index 31fca17dd6..00052f51b4 100644
> --- a/doc/muxers.texi
> +++ b/doc/muxers.texi
> @@ -1333,6 +1333,56 @@ Set custom HTTP headers, can override built in
> default headers. Applicable only
>
>  @end table
>
> +@anchor{rtc}
> +@section rtc
> +
> +WebRTC (Web Real-Time Communication) muxer that supports sub-second latency
> +streaming according to the WHIP (WebRTC-HTTP ingestion protocol) specification.
> +
> +It uses HTTP as a signaling protocol to exchange SDP capabilities and ICE-lite
> +candidates. It then uses STUN binding requests and responses to establish a
> +session over UDP, initiates a DTLS handshake to exchange the SRTP encryption
> +keys, and finally splits video and audio frames into RTP packets and encrypts
> +them using SRTP.
> +
> +Use H.264 without B-frames for video and Opus for audio. For example, to
> +convert an input file with @command{ffmpeg} to WebRTC:
> +@example
> +ffmpeg -re -i input.mp4 -acodec libopus -ar 48000 -ac 2 \
> +  -vcodec libx264 -profile:v baseline -tune zerolatency -threads 1 -bf 0 \
> +  -f rtc "http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream"
> +@end example
> +
> +This example uses low-latency options and achieves an end-to-end latency of
> +approximately 150ms.
> +
> +@subsection Options
> +
> +This muxer supports the following options:
> +
> +@table @option
> +
> +@item ice_arq_max @var{count}
> +Set the maximum number of retransmissions for the ICE ARQ mechanism.
> +Default value is 5.
> +
> +@item ice_arq_timeout @var{milliseconds}
> +Set the start timeout in milliseconds for the ICE ARQ mechanism.
> +Default value is 30.
> +
> +@item dtls_arq_max @var{count}
> +Set the maximum number of retransmissions for the DTLS ARQ mechanism.
> +Default value is 5.
> +
> +@item dtls_arq_timeout @var{milliseconds}
> +Set the start timeout in milliseconds for the DTLS ARQ mechanism.
> +Default value is 50.
> +
> +@item pkt_size @var{size}
> +Set the maximum size, in bytes, of the RTP packets that are sent out.
> +Default value is 1500.
> +
> +@end table
> +
>  @anchor{ico}
>  @section ico
>
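
For reference, the documented options can also be set from the API; a minimal
sketch (the helper below is hypothetical, it only assumes the muxer name "rtc"
and the option names defined in this patch):

#include "libavformat/avformat.h"
#include "libavutil/opt.h"

static int open_whip_muxer(AVFormatContext **octx, const char *url)
{
    int ret = avformat_alloc_output_context2(octx, NULL, "rtc", url);
    if (ret < 0)
        return ret;
    /* Smaller packets than the 1500-byte default, more ICE retries. */
    av_opt_set_int(*octx, "pkt_size",      1200, AV_OPT_SEARCH_CHILDREN);
    av_opt_set_int(*octx, "ice_arq_max",     10, AV_OPT_SEARCH_CHILDREN);
    av_opt_set_int(*octx, "ice_arq_timeout", 30, AV_OPT_SEARCH_CHILDREN);
    return 0;
}

Streams are then added with avformat_new_stream() and the session is started
with avformat_write_header() as with any other muxer.
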
> diff --git a/libavformat/Makefile b/libavformat/Makefile
> index f8ad7c6a11..2a76e613b0 100644
> --- a/libavformat/Makefile
> +++ b/libavformat/Makefile
> @@ -493,6 +493,7 @@ OBJS-$(CONFIG_RSD_DEMUXER)               += rsd.o
>  OBJS-$(CONFIG_RPL_DEMUXER)               += rpl.o
>  OBJS-$(CONFIG_RSO_DEMUXER)               += rsodec.o rso.o pcm.o
>  OBJS-$(CONFIG_RSO_MUXER)                 += rsoenc.o rso.o rawenc.o
> +OBJS-$(CONFIG_RTC_MUXER)                 += rtcenc.o avc.o http.o srtp.o
>  OBJS-$(CONFIG_RTP_MPEGTS_MUXER)          += rtpenc_mpegts.o
>  OBJS-$(CONFIG_RTP_MUXER)                 += rtp.o         \
>                                              rtpenc_aac.o     \
> diff --git a/libavformat/allformats.c b/libavformat/allformats.c
> index efdb34e29d..56e852e5ea 100644
> --- a/libavformat/allformats.c
> +++ b/libavformat/allformats.c
> @@ -392,6 +392,7 @@ extern const AVInputFormat  ff_rpl_demuxer;
>  extern const AVInputFormat  ff_rsd_demuxer;
>  extern const AVInputFormat  ff_rso_demuxer;
>  extern const FFOutputFormat ff_rso_muxer;
> +extern const FFOutputFormat ff_rtc_muxer;
>  extern const AVInputFormat  ff_rtp_demuxer;
>  extern const FFOutputFormat ff_rtp_muxer;
>  extern const FFOutputFormat ff_rtp_mpegts_muxer;
> diff --git a/libavformat/http.c b/libavformat/http.c
> index 0817aafb5b..6f509656b5 100644
> --- a/libavformat/http.c
> +++ b/libavformat/http.c
> @@ -530,6 +530,12 @@ int ff_http_averror(int status_code, int
> default_averror)
>          return default_averror;
>  }
>
> +const char* ff_http_get_new_location(URLContext *h)
> +{
> +    HTTPContext *s = h->priv_data;
> +    return s->new_location;
> +}
> +
>  static int http_write_reply(URLContext* h, int status_code)
>  {
>      int ret, body = 0, reply_code, message_len;
> diff --git a/libavformat/http.h b/libavformat/http.h
> index 5f650ef143..d1b691826b 100644
> --- a/libavformat/http.h
> +++ b/libavformat/http.h
> @@ -62,4 +62,6 @@ int ff_http_do_new_request2(URLContext *h, const char *uri, AVDictionary **options);
>
>  int ff_http_averror(int status_code, int default_averror);
>
> +const char* ff_http_get_new_location(URLContext *h);
> +
>  #endif /* AVFORMAT_HTTP_H */
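
As an aside on the http.h addition above: a hedged sketch of how a caller might
use ff_http_get_new_location() to remember the WHIP resource URL returned in
the Location header of the POST response (the helper name and its surroundings
are illustrative, not part of the patch):

#include "libavformat/http.h"
#include "libavformat/url.h"
#include "libavutil/error.h"
#include "libavutil/mem.h"

static int save_whip_resource_url(URLContext *whip_uc, char **resource_url)
{
    /* Points into the http context; copy it before the context is closed. */
    const char *loc = ff_http_get_new_location(whip_uc);

    if (!loc || !*loc)
        return 0;
    *resource_url = av_strdup(loc);
    return *resource_url ? 0 : AVERROR(ENOMEM);
}
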
> diff --git a/libavformat/rtcenc.c b/libavformat/rtcenc.c
> new file mode 100644
> index 0000000000..65173d0073
> --- /dev/null
> +++ b/libavformat/rtcenc.c
> @@ -0,0 +1,2208 @@
> +/*
> + * WebRTC-HTTP ingestion protocol (WHIP) muxer
> + * Copyright (c) 2023 The FFmpeg Project
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#if !CONFIG_OPENSSL
> +#error "DTLS is not supported, please enable openssl"
> +#else
> +#include <openssl/ssl.h>
> +#include <openssl/err.h>
> +#if OPENSSL_VERSION_NUMBER < 0x1010102fL
> +#error "OpenSSL version 1.1.1b or newer is required"
> +#endif
> +#endif
> +
> +#include "libavcodec/avcodec.h"
> +#include "libavutil/base64.h"
> +#include "libavutil/bprint.h"
> +#include "libavutil/crc.h"
> +#include "libavutil/hmac.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/random_seed.h"
> +#include "libavutil/time.h"
> +#include "avc.h"
> +#include "avio_internal.h"
> +#include "http.h"
> +#include "internal.h"
> +#include "mux.h"
> +#include "network.h"
> +#include "srtp.h"
> +
> +/**
> + * Maximum size limit of a Session Description Protocol (SDP),
> + * be it an offer or answer.
> + */
> +#define MAX_SDP_SIZE 8192
>

What is this 8192-byte limit based on?


> +/**
> + * Maximum size of the buffer used for sending and receiving UDP packets.
> + * Note that this does not limit the size of the UDP packets that are sent;
> + * that limit is controlled by the `pkt_size` option. For instance, the UDP
> + * buffer may be 4096 bytes while `pkt_size` limits each packet to 1400 bytes.
> + */
> +#define MAX_UDP_BUFFER_SIZE 4096
[...]

> +
> +/**
> + * Parses the ISOM AVCC extradata and extracts the SPS/PPS.
> + *
> + * Extradata in both ISOM (AVCC) and annexb form is accepted, but only ISOM
> + * data is actually parsed. Annexb extradata is left untouched and later used
> + * as a whole as the sequence header containing the SPS/PPS. Refer to
> + * ff_isom_write_avcc.
> + *
> + * @param s                Pointer to the AVFormatContext
> + * @param extradata        Pointer to the extradata
> + * @param extradata_size   Size of the extradata
> + * @returns                0 on success, or AVERROR_xxx on error
> + */
> +static int isom_read_avcc(AVFormatContext *s, uint8_t *extradata, int extradata_size)
> +{
> +    int ret = 0;
> +    uint8_t version, nal_length_size, nb_sps, nb_pps;
> +    AVIOContext *pb;
> +    RTCContext *rtc = s->priv_data;
> +
> +    if (!extradata || !extradata_size)
> +        return 0;
> +
> +    /* Not H.264 ISOM format, may be annexb etc. */
> +    if (extradata_size < 4 || extradata[0] != 1) {
> +        if (!ff_avc_find_startcode(extradata, extradata +
> extradata_size)) {
> +            av_log(s, AV_LOG_ERROR, "Format must be ISOM or annexb\n");
> +            return AVERROR_INVALIDDATA;
> +        }
> +        return 0;
> +    }
> +
> +    /* Parse the SPS/PPS in ISOM format in extradata. */
> +    pb = avio_alloc_context(extradata, extradata_size, 0, NULL, NULL,
> NULL, NULL);
> +    if (!pb)
> +        return AVERROR(ENOMEM);
> +
> +    version = avio_r8(pb); /* version */
> +    avio_r8(pb); /* avc profile */
> +    avio_r8(pb); /* avc profile compat */
> +    avio_r8(pb); /* avc level */
> +    nal_length_size = avio_r8(pb); /* 6 bits reserved (111111) + 2 bits nal size length - 1 (11) */
> +    nb_sps = avio_r8(pb); /* 3 bits reserved (111) + 5 bits number of sps */
> +
> +    if (version != 1) {
> +        av_log(s, AV_LOG_ERROR, "Invalid version=%d\n", version);
> +        ret = AVERROR_INVALIDDATA;
> +        goto end;
> +    }
> +
> +    rtc->avc_nal_length_size = (nal_length_size & 0x03) + 1;
> +    if (rtc->avc_nal_length_size == 3) {
> +        av_log(s, AV_LOG_ERROR, "Invalid nal length size=%d\n",
> rtc->avc_nal_length_size);
> +        ret = AVERROR_INVALIDDATA;
> +        goto end;
> +    }
> +
> +    /* Read SPS */
> +    nb_sps &= 0x1f;
> +    if (nb_sps != 1 || avio_feof(pb)) {
> +        av_log(s, AV_LOG_ERROR, "Invalid number of sps=%d, eof=%d\n",
> nb_sps, avio_feof(pb));
> +        ret = AVERROR_INVALIDDATA;
> +        goto end;
> +    }
> +
> +    rtc->avc_sps_size = avio_rb16(pb); /* sps size */
> +    if (rtc->avc_sps_size <= 0 || avio_feof(pb)) {
> +        av_log(s, AV_LOG_ERROR, "Invalid sps size=%d, eof=%d\n",
> rtc->avc_sps_size, avio_feof(pb));
> +        ret = AVERROR_INVALIDDATA;
> +        goto end;
> +    }
> +
> +    rtc->avc_sps = av_malloc(rtc->avc_sps_size);
> +    if (!rtc->avc_sps) {
> +        ret = AVERROR(ENOMEM);
> +        goto end;
> +    }
> +
> +    ret = avio_read(pb, rtc->avc_sps, rtc->avc_sps_size); /* sps */
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to read sps, size=%d\n",
> rtc->avc_sps_size);
> +        goto end;
> +    }
> +
> +    /* Read PPS */
> +    nb_pps = avio_r8(pb); /* number of pps */
> +    if (nb_pps != 1 || avio_feof(pb)) {
> +        av_log(s, AV_LOG_ERROR, "Invalid number of pps=%d, eof=%d\n",
> nb_pps, avio_feof(pb));
> +        ret = AVERROR_INVALIDDATA;
> +        goto end;
> +    }
> +
> +    rtc->avc_pps_size = avio_rb16(pb); /* pps size */
> +    if (rtc->avc_pps_size <= 0 || avio_feof(pb)) {
> +        av_log(s, AV_LOG_ERROR, "Invalid pps size=%d, eof=%d\n",
> rtc->avc_pps_size, avio_feof(pb));
> +        ret = AVERROR_INVALIDDATA;
> +        goto end;
> +    }
> +
> +    rtc->avc_pps = av_malloc(rtc->avc_pps_size);
> +    if (!rtc->avc_pps) {
> +        ret = AVERROR(ENOMEM);
> +        goto end;
> +    }
> +
> +    ret = avio_read(pb, rtc->avc_pps, rtc->avc_pps_size); /* pps */
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to read pps, size=%d\n",
> rtc->avc_pps_size);
> +        goto end;
> +    }
> +
> +end:
> +    avio_context_free(&pb);
> +    return ret;
> +}
>

This code is duplicated from other parts of FFmpeg

> +/**

> + * Generate the SDP offer according to the codec parameters, DTLS and ICE information.
> + * Below is an example of such an offer:
> + *
> + *       v=0
> + *       o=FFmpeg 4489045141692799359 2 IN IP4 127.0.0.1
> + *       s=FFmpegPublishSession
> + *       t=0 0
> + *       a=group:BUNDLE 0 1
> + *       a=extmap-allow-mixed
> + *       a=msid-semantic: WMS
> + *
> + *       m=audio 9 UDP/TLS/RTP/SAVPF 111
> + *       c=IN IP4 0.0.0.0
> + *       a=ice-ufrag:a174B
> + *       a=ice-pwd:wY8rJ3gNLxL3eWZs6UPOxy
> + *       a=fingerprint:sha-256 EE:FE:A2:E5:6A:21:78:60:71:2C:21:DC:1A:2C:98:12:0C:E8:AD:68:07:61:1B:0E:FC:46:97:1E:BC:97:4A:54
> + *       a=setup:actpass
> + *       a=mid:0
> + *       a=sendonly
> + *       a=msid:FFmpeg audio
> + *       a=rtcp-mux
> + *       a=rtpmap:111 opus/48000/2
> + *       a=ssrc:4267647086 cname:FFmpeg
> + *       a=ssrc:4267647086 msid:FFmpeg audio
> + *
> + *       m=video 9 UDP/TLS/RTP/SAVPF 106
> + *       c=IN IP4 0.0.0.0
> + *       a=ice-ufrag:a174B
> + *       a=ice-pwd:wY8rJ3gNLxL3eWZs6UPOxy
> + *       a=fingerprint:sha-256 EE:FE:A2:E5:6A:21:78:60:71:2C:21:DC:1A:2C:98:12:0C:E8:AD:68:07:61:1B:0E:FC:46:97:1E:BC:97:4A:54
> + *       a=setup:actpass
> + *       a=mid:1
> + *       a=sendonly
> + *       a=msid:FFmpeg video
> + *       a=rtcp-mux
> + *       a=rtcp-rsize
> + *       a=rtpmap:106 H264/90000
> + *       a=fmtp:106 level-asymmetry-allowed=1;packetization-mode=1;profile-level-id=42e01f
> + *       a=ssrc:107169110 cname:FFmpeg
> + *       a=ssrc:107169110 msid:FFmpeg video
> + *
>

This is not legal SDP; each line must end with only one space (yes, this is
regularly gotten wrong).
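
For what it's worth, a minimal sketch of emitting one media section with
AVBPrint, CRLF-terminated and with no stray trailing whitespace (the function,
its parameters and the fixed payload type 111 are illustrative, not the
patch's generate_sdp_offer()):

#include <stdint.h>
#include "libavutil/bprint.h"
#include "libavutil/error.h"

static int write_audio_mline(AVBPrint *bp, const char *ufrag, const char *pwd,
                             const char *fingerprint, uint32_t ssrc)
{
    av_bprintf(bp, "m=audio 9 UDP/TLS/RTP/SAVPF 111\r\n");
    av_bprintf(bp, "c=IN IP4 0.0.0.0\r\n");
    av_bprintf(bp, "a=ice-ufrag:%s\r\n", ufrag);
    av_bprintf(bp, "a=ice-pwd:%s\r\n", pwd);
    av_bprintf(bp, "a=fingerprint:sha-256 %s\r\n", fingerprint);
    av_bprintf(bp, "a=setup:actpass\r\n");
    av_bprintf(bp, "a=mid:0\r\n");
    av_bprintf(bp, "a=sendonly\r\n");
    av_bprintf(bp, "a=rtcp-mux\r\n");
    av_bprintf(bp, "a=rtpmap:111 opus/48000/2\r\n");
    av_bprintf(bp, "a=ssrc:%u cname:FFmpeg\r\n", ssrc);
    /* av_bprint_init(bp, 1, MAX_SDP_SIZE) is assumed to have been called. */
    return av_bprint_is_complete(bp) ? 0 : AVERROR(ENOMEM);
}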


>
>
> +
> +/**
> + * Opens the UDP transport and completes the ICE handshake, using fast
> + * retransmission to handle packet loss for the binding request.
> + *
> + * To initiate a fast retransmission of the STUN binding request during ICE,
> + * we wait only for the local ICE process to succeed, i.e., for a binding
> + * response to be received from the server. Since the server's binding request
> + * may not arrive, we do not always wait for it. However, we always respond to
> + * the server's binding requests during ICE, DTLS or RTP streaming.
> + *
> + * @param s Pointer to the AVFormatContext
> + * @return Returns 0 if the handshake was successful or AVERROR_xxx in case of an error
> + */
> +static int ice_handshake(AVFormatContext *s)
> +{
> +    int ret, size;
> +    char url[256], tmp[16];
> +    char req_buf[MAX_UDP_BUFFER_SIZE], res_buf[MAX_UDP_BUFFER_SIZE];
> +    RTCContext *rtc = s->priv_data;
> +    int fast_retries = rtc->ice_arq_max, timeout = rtc->ice_arq_timeout;
> +
> +    /* Build UDP URL and create the UDP context as transport. */
> +    ff_url_join(url, sizeof(url), "udp", NULL, rtc->ice_host,
> rtc->ice_port, NULL);
> +    ret = ffurl_alloc(&rtc->udp_uc, url, AVIO_FLAG_WRITE,
> &s->interrupt_callback);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to open udp://%s:%d\n",
> rtc->ice_host, rtc->ice_port);
> +        goto end;
> +    }
> +
> +    av_opt_set(rtc->udp_uc->priv_data, "connect", "1", 0);
> +    av_opt_set(rtc->udp_uc->priv_data, "fifo_size", "0", 0);
> +    /* Set the max packet size to the buffer size. */
> +    snprintf(tmp, sizeof(tmp), "%d", rtc->pkt_size);
> +    av_opt_set(rtc->udp_uc->priv_data, "pkt_size", tmp, 0);
> +
> +    ret = ffurl_connect(rtc->udp_uc, NULL);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to connect udp://%s:%d\n",
> rtc->ice_host, rtc->ice_port);
> +        goto end;
> +    }
> +
> +    /* Make the socket non-blocking, set to READ and WRITE mode after
> connected */
> +    ff_socket_nonblock(ffurl_get_file_handle(rtc->udp_uc), 1);
> +    rtc->udp_uc->flags |= AVIO_FLAG_READ | AVIO_FLAG_NONBLOCK;
> +
> +    /* Build the STUN binding request. */
> +    ret = ice_create_request(s, req_buf, sizeof(req_buf), &size);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to create STUN binding request,
> size=%d\n", size);
> +        goto end;
> +    }
> +
> +    /* Fast retransmit the STUN binding request. */
> +    while (1) {
> +        ret = ffurl_write(rtc->udp_uc, req_buf, size);
> +        if (ret < 0) {
> +            av_log(s, AV_LOG_ERROR, "Failed to send STUN binding request,
> size=%d\n", size);
> +            goto end;
> +        }
> +
> +        /* Wait a bit so that the server can process the request; then no ARQ is needed. */
> +#if ICE_PROCESSING_TIMEOUT > 0
> +        av_usleep(ICE_PROCESSING_TIMEOUT * 10000);
> +#endif
> +
> +        /* Read the STUN binding response. */
> +        ret = ffurl_read(rtc->udp_uc, res_buf, sizeof(res_buf));
> +        if (ret < 0) {
> +            /* If max retries is 6 and start timeout is 21ms, the total timeout
> +             * is about 21 + 42 + 84 + 168 + 336 + 672 = 1323ms. */
> +            av_usleep(timeout * 1000);
> +            timeout *= 2;
> +
> +            if (ret == AVERROR(EAGAIN) && fast_retries) {
> +                fast_retries--;
> +                continue;
> +            }
> +
> +            av_log(s, AV_LOG_ERROR, "Failed to read STUN binding
> response, retries=%d\n", rtc->ice_arq_max);
> +            goto end;
> +        }
> +
> +        /* Once any binding response is received, the fast retransmission is done. */
> +        if (ice_is_binding_response(res_buf, ret))
> +            break;
> +
> +        /* When a binding request is received, it is necessary to respond immediately. */
> +        if (ice_is_binding_request(res_buf, ret) && (ret =
> ice_handle_binding_request(s, res_buf, ret)) < 0)
> +            goto end;
> +    }
> +
> +    /* Wait just a short while for a possible binding request from the server. */
> +    fast_retries = rtc->ice_arq_max / 2;
> +    timeout = rtc->ice_arq_timeout;
> +    while (fast_retries) {
> +        ret = ffurl_read(rtc->udp_uc, res_buf, sizeof(res_buf));
> +        if (ret < 0) {
> +            /* If max retries is 6 and start timeout is 21ms, the total timeout
> +             * is about 21 + 42 + 84 = 147ms. */
> +            av_usleep(timeout * 1000);
> +            timeout *= 2;
> +
> +            if (ret == AVERROR(EAGAIN)) {
> +                fast_retries--;
> +                continue;
> +            }
> +
> +            av_log(s, AV_LOG_ERROR, "Failed to read STUN binding request,
> retries=%d\n", rtc->ice_arq_max);
> +            goto end;
> +        }
> +
> +        /* When a binding request is received, it is necessary to respond immediately. */
> +        if (ice_is_binding_request(res_buf, ret)) {
> +            if ((ret = ice_handle_binding_request(s, res_buf, ret)) < 0)
> +                goto end;
> +            break;
> +        }
> +    }
> +
> +    av_log(s, AV_LOG_INFO, "WHIP: ICE STUN ok, url=udp://%s:%d,
> location=%s, username=%s:%s, req=%dB, res=%dB, arq=%d\n",
> +        rtc->ice_host, rtc->ice_port, rtc->whip_resource_url ?
> rtc->whip_resource_url : "",
> +        rtc->ice_ufrag_remote, rtc->ice_ufrag_local, size, ret,
> +        rtc->ice_arq_max - fast_retries);
> +    ret = 0;
> +
> +end:
> +    return ret;
> +}
> +
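
For readers following along, the binding request built by the (snipped)
ice_create_request() starts with the fixed 20-byte STUN header of RFC 5389; a
self-contained sketch of just that header (illustrative, not the patch's code):

#include <stdint.h>
#include <string.h>

#define STUN_BINDING_REQUEST 0x0001
#define STUN_MAGIC_COOKIE    0x2112A442u

/* Write the 20-byte STUN header; attributes such as USERNAME and
 * MESSAGE-INTEGRITY follow it and their total length goes in attr_len. */
static int stun_write_header(uint8_t *buf, size_t size,
                             uint16_t attr_len, const uint8_t tid[12])
{
    if (size < 20)
        return -1;
    buf[0] = STUN_BINDING_REQUEST >> 8;
    buf[1] = STUN_BINDING_REQUEST & 0xff;
    buf[2] = attr_len >> 8;
    buf[3] = attr_len & 0xff;
    buf[4] = STUN_MAGIC_COOKIE >> 24;
    buf[5] = STUN_MAGIC_COOKIE >> 16;
    buf[6] = STUN_MAGIC_COOKIE >> 8;
    buf[7] = STUN_MAGIC_COOKIE & 0xff;
    memcpy(buf + 8, tid, 12); /* 96-bit transaction ID */
    return 20;
}
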
> +/**
> + * Establish the SRTP context using the keying material exported from DTLS.
> + *
> + * Create separate SRTP contexts for sending video and audio, as their RTP
> + * sequence numbers differ and they must not share a single context. Generate
> + * a single SRTP context for receiving RTCP only.
> + *
> + * @return 0 if OK, AVERROR_xxx on error
> + */
> +static int setup_srtp(AVFormatContext *s)
> +{
> +    int ret;
> +    char recv_key[DTLS_SRTP_MASTER_KEY_LEN],
> send_key[DTLS_SRTP_MASTER_KEY_LEN];
> +    char buf[AV_BASE64_SIZE(DTLS_SRTP_MASTER_KEY_LEN)];
> +    const char* suite = "AES_CM_128_HMAC_SHA1_80";
> +    RTCContext *rtc = s->priv_data;
> +
> +    /* As DTLS client, the send key is client master key plus salt. */
> +    memcpy(send_key, rtc->dtls_ctx.dtls_srtp_material, 16);
> +    memcpy(send_key + 16, rtc->dtls_ctx.dtls_srtp_material + 32, 14);
> +
> +    /* As DTLS client, the recv key is server master key plus salt. */
> +    memcpy(recv_key, rtc->dtls_ctx.dtls_srtp_material + 16, 16);
> +    memcpy(recv_key + 16, rtc->dtls_ctx.dtls_srtp_material + 46, 14);
> +
> +    /* Setup SRTP context for outgoing packets */
> +    if (!av_base64_encode(buf, sizeof(buf), send_key, sizeof(send_key))) {
> +        av_log(s, AV_LOG_ERROR, "Failed to encode send key\n");
> +        ret = AVERROR(EIO);
> +        goto end;
> +    }
> +
> +    ret = ff_srtp_set_crypto(&rtc->srtp_audio_send, suite, buf);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to set crypto for audio send\n");
> +        goto end;
> +    }
> +
> +    ret = ff_srtp_set_crypto(&rtc->srtp_video_send, suite, buf);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to set crypto for video send\n");
> +        goto end;
> +    }
> +
> +    ret = ff_srtp_set_crypto(&rtc->srtp_rtcp_send, suite, buf);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to set crypto for rtcp send\n");
> +        goto end;
> +    }
> +
> +    /* Setup SRTP context for incoming packets */
> +    if (!av_base64_encode(buf, sizeof(buf), recv_key, sizeof(recv_key))) {
> +        av_log(s, AV_LOG_ERROR, "Failed to encode recv key\n");
> +        ret = AVERROR(EIO);
> +        goto end;
> +    }
> +
> +    ret = ff_srtp_set_crypto(&rtc->srtp_recv, suite, buf);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to set crypto for recv\n");
> +        goto end;
> +    }
> +
> +    av_log(s, AV_LOG_INFO, "WHIP: SRTP setup done, suite=%s, key=%luB\n",
> suite, sizeof(send_key));
> +
> +end:
> +    return ret;
> +}
> +
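
To spell out the magic offsets in those memcpy()s: the exported DTLS keying
material is laid out as in RFC 5764, two 16-byte master keys followed by two
14-byte master salts for AES_CM_128_HMAC_SHA1_80. A standalone sketch with the
same offsets (illustrative helper, not part of the patch):

#include <stdint.h>
#include <string.h>

/* Split 60 bytes of exported keying material into the client (send) and
 * server (recv) key||salt pairs used by setup_srtp() above. */
static void split_dtls_srtp_material(const uint8_t *material,
                                     uint8_t client_key[30],
                                     uint8_t server_key[30])
{
    memcpy(client_key,      material,      16); /* client master key  */
    memcpy(client_key + 16, material + 32, 14); /* client master salt */
    memcpy(server_key,      material + 16, 16); /* server master key  */
    memcpy(server_key + 16, material + 46, 14); /* server master salt */
}
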
> +/**
> + * Creates dedicated RTP muxers for each stream in the AVFormatContext to
> + * build RTP packets from the encoded frames.
> + *
> + * The corresponding SRTP context is used to encrypt each stream's RTP
> + * packets; for example, the video SRTP context is used for the video stream.
> + * Additionally, the "on_rtp_write_packet" callback is set as the write
> + * function of each RTP muxer so that encrypted RTP packets are sent out.
> + *
> + * @return 0 if OK, AVERROR_xxx on error
> + */
> +static int create_rtp_muxer(AVFormatContext *s)
> +{
> +    int ret, i, is_video, buffer_size, max_packet_size;
> +    AVFormatContext *rtp_ctx = NULL;
> +    AVDictionary *opts = NULL;
> +    uint8_t *buffer = NULL;
> +    char buf[64];
> +    RTCContext *rtc = s->priv_data;
> +
> +    const AVOutputFormat *rtp_format = av_guess_format("rtp", NULL, NULL);
> +    if (!rtp_format) {
> +        av_log(s, AV_LOG_ERROR, "Failed to guess rtp muxer\n");
> +        ret = AVERROR(ENOSYS);
> +        goto end;
> +    }
> +
> +    /* The UDP buffer size, which may be greater than the MTU. */
> +    buffer_size = MAX_UDP_BUFFER_SIZE;
> +    /* The maximum RTP payload size; some bytes are reserved for the SRTP checksum and padding. */
> +    max_packet_size = rtc->pkt_size - DTLS_SRTP_CHECKSUM_LEN;
> +
> +    for (i = 0; i < s->nb_streams; i++) {
> +        rtp_ctx = avformat_alloc_context();
> +        if (!rtp_ctx) {
> +            ret = AVERROR(ENOMEM);
> +            goto end;
> +        }
> +
> +        rtp_ctx->oformat = rtp_format;
> +        if (!avformat_new_stream(rtp_ctx, NULL)) {
> +            ret = AVERROR(ENOMEM);
> +            goto end;
> +        }
> +        /* Pass the interrupt callback on */
> +        rtp_ctx->interrupt_callback = s->interrupt_callback;
> +        /* Copy the max delay setting; the rtp muxer reads this. */
> +        rtp_ctx->max_delay = s->max_delay;
> +        /* Copy other stream parameters. */
> +        rtp_ctx->streams[0]->sample_aspect_ratio =
> s->streams[i]->sample_aspect_ratio;
> +        rtp_ctx->flags |= s->flags & AVFMT_FLAG_BITEXACT;
> +        rtp_ctx->strict_std_compliance = s->strict_std_compliance;
> +
> +        /* Set the synchronized start time. */
> +        rtp_ctx->start_time_realtime = s->start_time_realtime;
> +
> +        avcodec_parameters_copy(rtp_ctx->streams[0]->codecpar,
> s->streams[i]->codecpar);
> +        rtp_ctx->streams[0]->time_base = s->streams[i]->time_base;
> +
> +        buffer = av_malloc(buffer_size);
> +        if (!buffer) {
> +            ret = AVERROR(ENOMEM);
> +            goto end;
> +        }
> +
> +        rtp_ctx->pb = avio_alloc_context(buffer, buffer_size, 1, s, NULL,
> on_rtp_write_packet, NULL);
> +        if (!rtp_ctx->pb) {
> +            ret = AVERROR(ENOMEM);
> +            goto end;
> +        }
> +        rtp_ctx->pb->max_packet_size = max_packet_size;
> +        rtp_ctx->pb->av_class = &ff_avio_class;
> +
> +        is_video = s->streams[i]->codecpar->codec_type ==
> AVMEDIA_TYPE_VIDEO;
> +        snprintf(buf, sizeof(buf), "%d", is_video?
> rtc->video_payload_type : rtc->audio_payload_type);
> +        av_dict_set(&opts, "payload_type", buf, 0);
> +        snprintf(buf, sizeof(buf), "%d", is_video? rtc->video_ssrc :
> rtc->audio_ssrc);
> +        av_dict_set(&opts, "ssrc", buf, 0);
> +
> +        ret = avformat_write_header(rtp_ctx, &opts);
> +        if (ret < 0) {
> +            av_log(s, AV_LOG_ERROR, "Failed to write rtp header\n");
> +            goto end;
> +        }
> +
> +        ff_format_set_url(rtp_ctx, av_strdup(s->url));
> +        s->streams[i]->time_base = rtp_ctx->streams[0]->time_base;
> +        s->streams[i]->priv_data = rtp_ctx;
> +        rtp_ctx = NULL;
> +    }
> +
> +    av_log(s, AV_LOG_INFO, "WHIP: Create RTP muxer OK, buffer_size=%d,
> max_packet_size=%d\n",
> +        buffer_size, max_packet_size);
> +
> +end:
> +    if (rtp_ctx)
> +        avio_context_free(&rtp_ctx->pb);
> +    avformat_free_context(rtp_ctx);
> +    av_dict_free(&opts);
> +    return ret;
> +}
> +
> +/**
> + * Callback triggered by the RTP muxer when it creates and sends out an RTP packet.
> + *
> + * This function modifies video STAP-A packets, clearing the marker bit and
> + * updating the NRI of the aggregation header to that of the first NALU. It
> + * then encrypts the RTP packet with the corresponding SRTP context, e.g. the
> + * video SRTP context for video packets.
> + */
> +static int on_rtp_write_packet(void *opaque, uint8_t *buf, int buf_size)
> +{
> +    int ret, cipher_size, is_rtcp, is_video;
> +    uint8_t payload_type, nalu_header;
> +    char cipher[MAX_UDP_BUFFER_SIZE];
> +    AVFormatContext *s = opaque;
> +    RTCContext *rtc = s->priv_data;
> +    struct SRTPContext *srtp;
> +
> +    /* Ignore if not RTP or RTCP packet. */
> +    if (buf_size < 12 || (buf[0] & 0xC0) != 0x80)
> +        return 0;
> +
> +    /* Only support audio, video and rtcp. */
> +    is_rtcp = buf[1] >= 192 && buf[1] <= 223;
> +    payload_type = buf[1] & 0x7f;
> +    is_video = payload_type == rtc->video_payload_type;
> +    if (!is_rtcp && payload_type != rtc->video_payload_type &&
> payload_type != rtc->audio_payload_type)
> +        return 0;
> +
> +    /**
> +     * For video, a STAP-A packet carrying SPS/PPS must satisfy:
> +     * 1. The marker bit must be 0, never 1.
> +     * 2. The NRI must equal that of the first NALU.
> +     */
> +    if (is_video && buf_size > 12) {
> +        nalu_header = buf[12] & 0x1f;
> +        if (nalu_header == NALU_TYPE_STAP_A) {
> +            /* Reset the marker bit to 0. */
> +            if (buf[1] & 0x80)
> +                buf[1] &= 0x7f;
> +
> +            /* Reset the NRI to the first NALU's NRI. */
> +            if (buf_size > 15 && (buf[15]&0x60) != (buf[12]&0x60))
> +                buf[12] = (buf[12]&0x80) | (buf[15]&0x60) |
> (buf[12]&0x1f);
> +        }
> +    }
> +
> +    /* Get the corresponding SRTP context. */
> +    srtp = is_rtcp ? &rtc->srtp_rtcp_send : (is_video?
> &rtc->srtp_video_send : &rtc->srtp_audio_send);
> +
> +    /* Encrypt by SRTP and send out. */
> +    cipher_size = ff_srtp_encrypt(srtp, buf, buf_size, cipher,
> sizeof(cipher));
> +    if (cipher_size <= 0 || cipher_size < buf_size) {
> +        av_log(s, AV_LOG_WARNING, "Failed to encrypt packet=%dB,
> cipher=%dB\n", buf_size, cipher_size);
> +        return 0;
> +    }
> +
> +    ret = ffurl_write(rtc->udp_uc, cipher, cipher_size);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to write packet=%dB, ret=%d\n",
> cipher_size, ret);
> +        return ret;
> +    }
> +
> +    return ret;
> +}
> +
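
A note on the STAP-A byte offsets touched above: buf[12] is the STAP-A
aggregation header and buf[15] is the header of the first aggregated NAL unit,
which follows the 2-byte NALU size at buf[13..14]. RFC 6184 actually requires
the NRI to be the maximum over all aggregated NAL units; the patch copies the
first one's. A standalone sketch of the same fix-up (assuming NAL type 24 for
STAP-A rather than the patch's NALU_TYPE_STAP_A macro):

#include <stdint.h>

static void stap_a_clear_marker_and_fix_nri(uint8_t *buf, int size)
{
    if (size <= 15 || (buf[12] & 0x1f) != 24 /* STAP-A */)
        return;
    buf[1]  &= 0x7f;                                           /* marker = 0 */
    buf[12]  = (buf[12] & 0x80) | (buf[15] & 0x60) | (buf[12] & 0x1f);
}
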
> +/**
> + * Inserts the SPS/PPS data before each IDR (Instantaneous Decoder Refresh) frame.
> + *
> + * The SPS/PPS is parsed from the extradata. If it is in ISOM format, the
> + * SPS/PPS is written into the data field of the packet; if it is in annexb
> + * format, the entire extradata is used as the data field of the packet.
> + */
> +static int insert_sps_pps_packet(AVFormatContext *s, AVPacket *pkt)
> +{
> +    int ret, is_idr, size, i;
> +    uint8_t *p;
> +    AVPacket* extra = NULL;
> +    AVStream *st = s->streams[pkt->stream_index];
> +    AVFormatContext *rtp_ctx = st->priv_data;
> +    RTCContext *rtc = s->priv_data;
> +
> +    is_idr = (pkt->flags & AV_PKT_FLAG_KEY) && st->codecpar->codec_type
> == AVMEDIA_TYPE_VIDEO;
> +    if (!is_idr || !st->codecpar->extradata)
> +        return 0;
> +
> +    extra = av_packet_alloc();
> +    if (!extra)
> +        return AVERROR(ENOMEM);
> +
> +    size = !rtc->avc_nal_length_size ? st->codecpar->extradata_size :
> +            rtc->avc_nal_length_size * 2 + rtc->avc_sps_size +
> rtc->avc_pps_size;
> +    ret = av_new_packet(extra, size);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to allocate extra packet\n");
> +        goto end;
> +    }
> +
> +    /* Encode SPS/PPS in annexb format. */
> +    if (!rtc->avc_nal_length_size) {
> +        memcpy(extra->data, st->codecpar->extradata, size);
> +    } else {
> +        /* Encode SPS/PPS in ISOM format. */
> +        p = extra->data;
> +        for (i = 0; i < rtc->avc_nal_length_size; i++) {
> +            *p++ = rtc->avc_sps_size >> (8 * (rtc->avc_nal_length_size -
> i - 1));
> +        }
> +        memcpy(p, rtc->avc_sps, rtc->avc_sps_size);
> +        p += rtc->avc_sps_size;
> +
> +        /* Encode PPS in ISOM format. */
> +        for (i = 0; i < rtc->avc_nal_length_size; i++) {
> +            *p++ = rtc->avc_pps_size >> (8 * (rtc->avc_nal_length_size -
> i - 1));
> +        }
> +        memcpy(p, rtc->avc_pps, rtc->avc_pps_size);
> +        p += rtc->avc_pps_size;
> +    }
> +
> +    /* Setup packet and feed it to chain. */
> +    extra->pts = pkt->pts;
> +    extra->dts = pkt->dts;
> +    extra->stream_index = pkt->stream_index;
> +    extra->time_base = pkt->time_base;
> +
> +    ret = ff_write_chained(rtp_ctx, 0, extra, s, 0);
> +    if (ret < 0)
> +        goto end;
> +
> +end:
> +    av_packet_free(&extra);
> +    return ret;
> +}
> +
> +/**
> + * RTC is connectionless since it is based on UDP, so the server has to detect
> + * a timed-out session by itself. In that case, a publisher cannot republish
> + * the stream until the session has timed out.
> + * This function notifies the server that the stream has ended, so the server
> + * can expire and close the session immediately and the publisher can
> + * republish the stream quickly.
> + */
> +static int whip_dispose(AVFormatContext *s)
> +{
> +    int ret;
> +    char buf[MAX_URL_SIZE];
> +    URLContext *whip_uc = NULL;
> +    RTCContext *rtc = s->priv_data;
> +
> +    if (!rtc->whip_resource_url)
> +        return 0;
> +
> +    ret = ffurl_alloc(&whip_uc, rtc->whip_resource_url,
> AVIO_FLAG_READ_WRITE, &s->interrupt_callback);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to alloc WHIP delete context:
> %s\n", s->url);
> +        goto end;
> +    }
> +
> +    av_opt_set(whip_uc->priv_data, "chunked_post", "0", 0);
> +    av_opt_set(whip_uc->priv_data, "method", "DELETE", 0);
> +    ret = ffurl_connect(whip_uc, NULL);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to DELETE url=%s\n",
> rtc->whip_resource_url);
> +        goto end;
> +    }
> +
> +    while (1) {
> +        ret = ffurl_read(whip_uc, buf, sizeof(buf));
> +        if (ret == AVERROR_EOF) {
> +            ret = 0;
> +            break;
> +        }
> +        if (ret < 0) {
> +            av_log(s, AV_LOG_ERROR, "Failed to read response from DELETE
> url=%s\n", rtc->whip_resource_url);
> +            goto end;
> +        }
> +    }
> +
> +    av_log(s, AV_LOG_INFO, "WHIP: Dispose resource %s ok\n",
> rtc->whip_resource_url);
> +
> +end:
> +    ffurl_closep(&whip_uc);
> +    return ret;
> +}
> +
> +static av_cold int rtc_init(AVFormatContext *s)
> +{
> +    int ret;
> +    RTCContext *rtc = s->priv_data;
> +
> +    if ((ret = whip_init(s)) < 0)
> +        return ret;
> +
> +    if ((ret = parse_codec(s)) < 0)
> +        return ret;
> +
> +    if ((ret = generate_sdp_offer(s)) < 0)
> +        return ret;
> +
> +    if ((ret = exchange_sdp(s)) < 0)
> +        return ret;
> +
> +    if ((ret = parse_answer(s)) < 0)
> +        return ret;
> +
> +    if ((ret = ice_handshake(s)) < 0)
> +        return ret;
> +
> +    /* Now UDP URL context is ready, setup the DTLS transport. */
> +    rtc->dtls_ctx.udp_uc = rtc->udp_uc;
> +
> +    if ((ret = dtls_context_handshake(&rtc->dtls_ctx)) < 0)
> +        return ret;
> +
> +    if ((ret = setup_srtp(s)) < 0)
> +        return ret;
> +
> +    if ((ret = create_rtp_muxer(s)) < 0)
> +        return ret;
> +
> +    return ret;
> +}
> +
> +static int rtc_write_packet(AVFormatContext *s, AVPacket *pkt)
> +{
> +    int ret;
> +    RTCContext *rtc = s->priv_data;
> +    AVStream *st = s->streams[pkt->stream_index];
> +    AVFormatContext *rtp_ctx = st->priv_data;
> +
> +    /* TODO: Send binding request every 1s as WebRTC heartbeat. */
> +    /* TODO: Receive packets from the server such as ICE binding
> requests, DTLS messages,
> +     * and RTCP like PLI requests, then respond to them.*/
> +
> +    /* For Opus audio streams, correct the timestamp. */
> +    if (st->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
> +        pkt->dts = pkt->pts = rtc->audio_jitter_base;
> +        rtc->audio_jitter_base += 960;
> +    }
>

Correct how?
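If the intent is to force a fixed 20 ms Opus cadence (48000 Hz * 0.02 s = 960
samples per packet), that only holds for one frame size. An untested sketch of
deriving the step from the packet instead (assuming the stream time base is
1/48000, as the chained RTP muxer uses for Opus):

#include <stdint.h>
#include "libavcodec/packet.h"

static int64_t next_audio_ts(int64_t *jitter_base, const AVPacket *pkt)
{
    int64_t ts = *jitter_base;
    /* Fall back to 20 ms worth of samples when the duration is unknown. */
    *jitter_base += pkt->duration > 0 ? pkt->duration : 960;
    return ts;
}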

> +    ret = insert_sps_pps_packet(s, pkt);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR, "Failed to insert SPS/PPS packet\n");
> +        return ret;
> +    }
> +
> +    ret = ff_write_chained(rtp_ctx, 0, pkt, s, 0);
> +    if (ret < 0) {
> +        if (ret == AVERROR(EINVAL)) {
> +            av_log(s, AV_LOG_WARNING, "Ignore failed to write packet=%dB,
> ret=%d\n", pkt->size, ret);
> +            ret = 0;
> +        } else
> +            av_log(s, AV_LOG_ERROR, "Failed to write packet, size=%d\n",
> pkt->size);
> +        return ret;
> +    }
> +
> +    return ret;
> +}
> +
> +static av_cold void rtc_deinit(AVFormatContext *s)
> +{
> +    int i, ret;
> +    RTCContext *rtc = s->priv_data;
> +
> +    ret = whip_dispose(s);
> +    if (ret < 0)
> +        av_log(s, AV_LOG_WARNING, "Failed to dispose resource, ret=%d\n",
> ret);
> +
> +    for (i = 0; i < s->nb_streams; i++) {
> +        AVFormatContext* rtp_ctx = s->streams[i]->priv_data;
> +        if (!rtp_ctx)
> +            continue;
> +
> +        av_write_trailer(rtp_ctx);
> +        avio_context_free(&rtp_ctx->pb);
> +        avformat_free_context(rtp_ctx);
> +        s->streams[i]->priv_data = NULL;
> +    }
> +
> +    av_freep(&rtc->avc_sps);
> +    av_freep(&rtc->avc_pps);
> +    av_freep(&rtc->sdp_offer);
> +    av_freep(&rtc->sdp_answer);
> +    av_freep(&rtc->whip_resource_url);
> +    av_freep(&rtc->ice_ufrag_remote);
> +    av_freep(&rtc->ice_pwd_remote);
> +    av_freep(&rtc->ice_protocol);
> +    av_freep(&rtc->ice_host);
> +    ffurl_closep(&rtc->udp_uc);
> +    ff_srtp_free(&rtc->srtp_audio_send);
> +    ff_srtp_free(&rtc->srtp_video_send);
> +    ff_srtp_free(&rtc->srtp_rtcp_send);
> +    ff_srtp_free(&rtc->srtp_recv);
> +    dtls_context_deinit(&rtc->dtls_ctx);
> +}
> +
> +#define OFFSET(x) offsetof(RTCContext, x)
> +#define DEC AV_OPT_FLAG_DECODING_PARAM
> +static const AVOption options[] = {
> +    { "ice_arq_max",        "Maximum number of retransmissions for the
> ICE ARQ mechanism",      OFFSET(ice_arq_max),        AV_OPT_TYPE_INT,    {
> .i64 = 5 },       -1, INT_MAX, DEC },
> +    { "ice_arq_timeout",    "Start timeout in milliseconds for the ICE
> ARQ mechanism",          OFFSET(ice_arq_timeout),    AV_OPT_TYPE_INT,    {
> .i64 = 30 },      -1, INT_MAX, DEC },
> +    { "dtls_arq_max",       "Maximum number of retransmissions for the
> DTLS ARQ mechanism",     OFFSET(dtls_arq_max),       AV_OPT_TYPE_INT,    {
> .i64 = 5 },       -1, INT_MAX, DEC },
> +    { "dtls_arq_timeout",   "Start timeout in milliseconds for the DTLS
> ARQ mechanism",         OFFSET(dtls_arq_timeout),   AV_OPT_TYPE_INT,    {
> .i64 = 50 },     -1, INT_MAX, DEC },
> +    { "pkt_size",           "The maximum size, in bytes, of RTP packets
> that send out",         OFFSET(pkt_size),           AV_OPT_TYPE_INT,    {
> .i64 = 1500 },    -1, INT_MAX, DEC },
> +    { NULL },
> +};
> +
> +static const AVClass rtc_muxer_class = {
> +    .class_name = "WebRTC muxer",
> +    .item_name  = av_default_item_name,
> +    .option     = options,
> +    .version    = LIBAVUTIL_VERSION_INT,
> +};
> +
> +const FFOutputFormat ff_rtc_muxer = {
> +    .p.name             = "rtc",
> +    .p.long_name        = NULL_IF_CONFIG_SMALL("WHIP WebRTC muxer"),
> +    .p.audio_codec      = AV_CODEC_ID_OPUS,
> +    .p.video_codec      = AV_CODEC_ID_H264,
> +    .p.flags            = AVFMT_GLOBALHEADER | AVFMT_NOFILE,
> +    .p.priv_class       = &rtc_muxer_class,
> +    .priv_data_size     = sizeof(RTCContext),
> +    .init               = rtc_init,
> +    .write_packet       = rtc_write_packet,
> +    .deinit             = rtc_deinit,
> +};
> --
> 2.40.0
>

This patchset is massive and unmaintainable. Not to mention it only implements
a small portion of WebRTC, and will inevitably get patched to add more
features and become a giant sprawl.

Kieran

