[FFmpeg-devel] [PATCH 06/21] swscale/output: add AYUV output support

Sat Oct 12 02:05:37 EEST 2024

On Fri, Oct 11, 2024 at 07:54:48PM -0300, James Almer wrote:
> On 10/11/2024 7:46 PM, Michael Niedermayer wrote:
> > On Tue, Oct 08, 2024 at 07:50:11PM -0300, James Almer wrote:
> > > Signed-off-by: James Almer <jamrial at gmail.com>
> > > ---
> > >   libswscale/output.c                      | 323 ++++++++++++-----------
> > >   libswscale/utils.c                       |   2 +-
> > >   tests/ref/fate/filter-pixdesc-ayuv       |   1 +
> > >   tests/ref/fate/filter-pixfmts-copy       |   1 +
> > >   tests/ref/fate/filter-pixfmts-crop       |   1 +
> > >   tests/ref/fate/filter-pixfmts-field      |   1 +
> > >   tests/ref/fate/filter-pixfmts-fieldorder |   1 +
> > >   tests/ref/fate/filter-pixfmts-hflip      |   1 +
> > >   tests/ref/fate/filter-pixfmts-il         |   1 +
> > >   tests/ref/fate/filter-pixfmts-null       |   1 +
> > >   tests/ref/fate/filter-pixfmts-pad        |   1 +
> > >   tests/ref/fate/filter-pixfmts-scale      |   1 +
> > >   tests/ref/fate/filter-pixfmts-transpose  |   1 +
> > >   tests/ref/fate/filter-pixfmts-vflip      |   1 +
> > >   14 files changed, 183 insertions(+), 154 deletions(-)
> > >   create mode 100644 tests/ref/fate/filter-pixdesc-ayuv
> > > 
> > > diff --git a/libswscale/output.c b/libswscale/output.c
> > > index c9dfd6f60a..328b108089 100644
> > > --- a/libswscale/output.c
> > > +++ b/libswscale/output.c
> > > @@ -2668,165 +2668,177 @@ yuv2xv36le_X_c(SwsContext *c, const int16_t *lumFilter,
> > >       }
> > >   }
> > > -static void
> > > -yuv2vuyX_1_c(SwsContext *c, const int16_t *buf0,
> > > -             const int16_t *ubuf[2], const int16_t *vbuf[2],
> > > -             const int16_t *abuf0, uint8_t *dest, int dstW,
> > > -             int uvalpha, int y)
> > > -{
> > > -    int hasAlpha = !!abuf0;
> > > -    int i;
> > > -
> > > -    if (uvalpha < 2048) {
> > > -        for (i = 0; i < dstW; i++) {
> > > -            int Y = (buf0[i] + 64) >> 7;
> > > -            int U = (ubuf[0][i] + 64) >> 7;
> > > -            int V = (vbuf[0][i] + 64) >> 7;
> > > -            int A = 255;
> > > -
> > > -            if (Y & 0x100)
> > > -                Y = av_clip_uint8(Y);
> > > -            if (U & 0x100)
> > > -                U = av_clip_uint8(U);
> > > -            if (V & 0x100)
> > > -                V = av_clip_uint8(V);
> > > -
> > > -            if (hasAlpha) {
> > > -                A = (abuf0[i] + 64) >> 7;
> > > -                if (A & 0x100)
> > > -                    A = av_clip_uint8(A);
> > > -            }
> > > -
> > > -            dest[4 * i    ] = V;
> > > -            dest[4 * i + 1] = U;
> > > -            dest[4 * i + 2] = Y;
> > > -            dest[4 * i + 3] = A;
> > > -        }
> > > -    } else {
> > > -        for (i = 0; i < dstW; i++) {
> > > -            int Y = (buf0[i] + 64) >> 7;
> > > -            int U = (ubuf[0][i] + ubuf[1][i] + 128) >> 8;
> > > -            int V = (vbuf[0][i] + vbuf[1][i] + 128) >> 8;
> > > -            int A = 255;
> > > -
> > > -            if (Y & 0x100)
> > > -                Y = av_clip_uint8(Y);
> > > -            if (U & 0x100)
> > > -                U = av_clip_uint8(U);
> > > -            if (V & 0x100)
> > > -                V = av_clip_uint8(V);
> > > -
> > > -            if (hasAlpha) {
> > > -                A = (abuf0[i] + 64) >> 7;
> > > -                if (A & 0x100)
> > > -                    A = av_clip_uint8(A);
> > > -            }
> > > -
> > > -            dest[4 * i    ] = V;
> > > -            dest[4 * i + 1] = U;
> > > -            dest[4 * i + 2] = Y;
> > > -            dest[4 * i + 3] = A;
> > > -        }
> > > -    }
> > > +#define AYUV_1_WRAPPER(fmt, C0, C1, C2, C3)                        \
> > > +static void                                                        \
> > > +yuv2 ## fmt ##_1_c(SwsContext *c, const int16_t *buf0,             \
> > > +                   const int16_t *ubuf[2], const int16_t *vbuf[2], \
> > > +                   const int16_t *abuf0, uint8_t *dest, int dstW,  \
> > > +                   int uvalpha, int y)                             \
> > > +{                                                                  \
> > > +    int hasAlpha = !!abuf0;                                        \
> > > +    int i;                                                         \
> > > +                                                                   \
> > > +    if (uvalpha < 2048) {                                          \
> > > +        for (i = 0; i < dstW; i++) {                               \
> > > +            int Y = (buf0[i] + 64) >> 7;                           \
> > > +            int U = (ubuf[0][i] + 64) >> 7;                        \
> > > +            int V = (vbuf[0][i] + 64) >> 7;                        \
> > > +            int A = 255;                                           \
> > > +                                                                   \
> > > +            if (Y & 0x100)                                         \
> > > +                Y = av_clip_uint8(Y);                              \
> > > +            if (U & 0x100)                                         \
> > > +                U = av_clip_uint8(U);                              \
> > > +            if (V & 0x100)                                         \
> > > +                V = av_clip_uint8(V);                              \
> > > +                                                                   \
> > > +            if (hasAlpha) {                                        \
> > > +                A = (abuf0[i] + 64) >> 7;                          \
> > > +                if (A & 0x100)                                     \
> > > +                    A = av_clip_uint8(A);                          \
> > > +            }                                                      \
> > > +                                                                   \
> > > +            dest[4 * i    ] = (C0);                                \
> > > +            dest[4 * i + 1] = (C1);                                \
> > > +            dest[4 * i + 2] = (C2);                                \
> > > +            dest[4 * i + 3] = (C3);                                \
> > > +        }                                                          \
> > > +    } else {                                                       \
> > > +        for (i = 0; i < dstW; i++) {                               \
> > > +            int Y = (buf0[i] + 64) >> 7;                           \
> > > +            int U = (ubuf[0][i] + ubuf[1][i] + 128) >> 8;          \
> > > +            int V = (vbuf[0][i] + vbuf[1][i] + 128) >> 8;          \
> > > +            int A = 255;                                           \
> > > +                                                                   \
> > > +            if (Y & 0x100)                                         \
> > > +                Y = av_clip_uint8(Y);                              \
> > > +            if (U & 0x100)                                         \
> > > +                U = av_clip_uint8(U);                              \
> > > +            if (V & 0x100)                                         \
> > > +                V = av_clip_uint8(V);                              \
> > > +                                                                   \
> > > +            if (hasAlpha) {                                        \
> > > +                A = (abuf0[i] + 64) >> 7;                          \
> > > +                if (A & 0x100)                                     \
> > > +                    A = av_clip_uint8(A);                          \
> > > +            }                                                      \
> > > +                                                                   \
> > > +            dest[4 * i    ] = (C0);                                \
> > > +            dest[4 * i + 1] = (C1);                                \
> > > +            dest[4 * i + 2] = (C2);                                \
> > > +            dest[4 * i + 3] = (C3);                                \
> > > +        }                                                          \
> > > +    }                                                              \
> > >   }
> > 
> > Is there an advantage in using huge multiline macros here ?
> > 
> > This is ugly, hard-to-maintain code. Simply writing an always-inline function
> > and trusting that the compiler will inline it should result in more normal
> > C code and the same result.
> > 
> > (Is it faster, or does it have some other advantage?)
> 
> No, just figured doing it like this. I can make it an always inline
> function.

Please do. We currently have a few slightly different ways this is done; here's
one example (and in this example, output_pixels could in fact itself be replaced
by a function, which would probably be cleaner too).

In fact, everything can be cleaned up, and I certainly would love to see someone
have a brilliant idea to make it cleaner with no disadvantages ...

/*
 * Store one packed 4:2:2 pixel pair (two luma samples plus one U and one V
 * sample) into the four bytes dest[pos] .. dest[pos + 3].
 *
 * The macro reads `dest` and `target` from the scope it is expanded in.
 * Byte order selected by `target`:
 *   AV_PIX_FMT_YUYV422      -> Y1 U  Y2 V
 *   AV_PIX_FMT_YVYU422      -> Y1 V  Y2 U
 *   any other value (UYVY)  -> U  Y1 V  Y2
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement and can be used safely as the body of an unbraced if/else.
 * Every argument is parenthesized so that expressions with low-precedence
 * operators (e.g. `1 << 2`) index and store as written. `pos` is expanded
 * several times, so pass an expression without side effects.
 */
#define output_pixels(pos, Y1, U, Y2, V) \
    do { \
        if (target == AV_PIX_FMT_YUYV422) { \
            dest[(pos) + 0] = (Y1); \
            dest[(pos) + 1] = (U);  \
            dest[(pos) + 2] = (Y2); \
            dest[(pos) + 3] = (V);  \
        } else if (target == AV_PIX_FMT_YVYU422) { \
            dest[(pos) + 0] = (Y1); \
            dest[(pos) + 1] = (V);  \
            dest[(pos) + 2] = (Y2); \
            dest[(pos) + 3] = (U);  \
        } else { /* AV_PIX_FMT_UYVY422 */ \
            dest[(pos) + 0] = (U);  \
            dest[(pos) + 1] = (Y1); \
            dest[(pos) + 2] = (V);  \
            dest[(pos) + 3] = (Y2); \
        } \
    } while (0)

/**
 * Produce one line of packed 4:2:2 output by summing several weighted
 * source rows per component.
 *
 * Each output pixel pair is built from two luma samples and one U/V pair.
 * Every accumulator starts at 1 << 18 (half of the final 1 << 19 divisor),
 * so the closing right shift by 19 rounds to nearest.
 *
 * @param c             unused in this function
 * @param lumFilter     lumFilterSize coefficients applied to the luma rows
 * @param lumSrc        lumFilterSize luma row pointers
 * @param lumFilterSize number of luma taps
 * @param chrFilter     chrFilterSize coefficients applied to the chroma rows
 * @param chrUSrc       chrFilterSize U row pointers
 * @param chrVSrc       chrFilterSize V row pointers
 * @param chrFilterSize number of chroma taps
 * @param alpSrc        unused in this function
 * @param dest          output line, written four bytes per pixel pair
 * @param dstW          output width in pixels; (dstW + 1) >> 1 pairs are
 *                      written, so an odd width still reads and writes the
 *                      second luma sample of the last pair
 * @param y             unused in this function
 * @param target        byte order selector consumed by output_pixels()
 */
static av_always_inline void
yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
                     const int16_t **lumSrc, int lumFilterSize,
                     const int16_t *chrFilter, const int16_t **chrUSrc,
                     const int16_t **chrVSrc, int chrFilterSize,
                     const int16_t **alpSrc, uint8_t *dest, int dstW,
                     int y, enum AVPixelFormat target)
{
    const int pairs = (dstW + 1) >> 1;

    for (int n = 0; n < pairs; n++) {
        const int even = 2 * n;
        const int odd  = even + 1;
        int luma0 = 1 << 18;
        int luma1 = 1 << 18;
        int cb    = 1 << 18;
        int cr    = 1 << 18;

        for (int k = 0; k < lumFilterSize; k++) {
            const int16_t *row = lumSrc[k];
            const int coeff    = lumFilter[k];

            luma0 += row[even] * coeff;
            luma1 += row[odd]  * coeff;
        }

        for (int k = 0; k < chrFilterSize; k++) {
            const int coeff = chrFilter[k];

            cb += chrUSrc[k][n] * coeff;
            cr += chrVSrc[k][n] * coeff;
        }

        luma0 >>= 19;
        luma1 >>= 19;
        cb    >>= 19;
        cr    >>= 19;

        /* Clip all four only when bit 8 is set in at least one of them. */
        if ((luma0 | luma1 | cb | cr) & 0x100) {
            luma0 = av_clip_uint8(luma0);
            luma1 = av_clip_uint8(luma1);
            cb    = av_clip_uint8(cb);
            cr    = av_clip_uint8(cr);
        }

        output_pixels(4 * n, luma0, cb, luma1, cr);
    }
}

/**
 * Produce one line of packed 4:2:2 output by blending two source rows per
 * component.
 *
 * Row 1 is weighted by yalpha / uvalpha and row 0 by the complement
 * (4096 - alpha), so the two weights always sum to 4096. The weighted sum
 * is then shifted right by 19.
 *
 * @param c        unused in this function
 * @param buf      two luma rows
 * @param ubuf     two U rows
 * @param vbuf     two V rows
 * @param abuf     unused in this function
 * @param dest     output line, written four bytes per pixel pair
 * @param dstW     output width in pixels; (dstW + 1) >> 1 pairs are written
 * @param yalpha   weight of buf[1]; asserted to be <= 4096
 * @param uvalpha  weight of ubuf[1] / vbuf[1]; asserted to be <= 4096
 * @param y        unused in this function
 * @param target   byte order selector consumed by output_pixels()
 */
static av_always_inline void
yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
                     const int16_t *ubuf[2], const int16_t *vbuf[2],
                     const int16_t *abuf[2], uint8_t *dest, int dstW,
                     int yalpha, int uvalpha, int y,
                     enum AVPixelFormat target)
{
    const int16_t *luma_lo = buf[0];
    const int16_t *luma_hi = buf[1];
    const int16_t *cb_lo   = ubuf[0];
    const int16_t *cb_hi   = ubuf[1];
    const int16_t *cr_lo   = vbuf[0];
    const int16_t *cr_hi   = vbuf[1];
    const int luma_w_lo    = 4096 - yalpha;
    const int chroma_w_lo  = 4096 - uvalpha;

    av_assert2(yalpha  <= 4096U);
    av_assert2(uvalpha <= 4096U);

    const int pairs = (dstW + 1) >> 1;

    for (int n = 0; n < pairs; n++) {
        const int even = n * 2;
        const int odd  = even + 1;
        int luma0 = (luma_lo[even] * luma_w_lo   + luma_hi[even] * yalpha)  >> 19;
        int luma1 = (luma_lo[odd]  * luma_w_lo   + luma_hi[odd]  * yalpha)  >> 19;
        int cb    = (cb_lo[n]      * chroma_w_lo + cb_hi[n]      * uvalpha) >> 19;
        int cr    = (cr_lo[n]      * chroma_w_lo + cr_hi[n]      * uvalpha) >> 19;

        /* Clip all four only when bit 8 is set in at least one of them. */
        if ((luma0 | luma1 | cb | cr) & 0x100) {
            luma0 = av_clip_uint8(luma0);
            luma1 = av_clip_uint8(luma1);
            cb    = av_clip_uint8(cb);
            cr    = av_clip_uint8(cr);
        }

        output_pixels(n * 4, luma0, cb, luma1, cr);
    }
}

/**
 * Produce one line of packed 4:2:2 output from a single luma row.
 *
 * Luma is always taken from buf0 as (sample + 64) >> 7. Chroma handling
 * depends on uvalpha:
 *   - uvalpha <  2048: only ubuf[0] / vbuf[0] are read, (sample + 64) >> 7;
 *   - uvalpha >= 2048: the two chroma rows are averaged,
 *                      (row0 + row1 + 128) >> 8.
 *
 * @param c        unused in this function
 * @param buf0     luma row
 * @param ubuf     two U rows; ubuf[1] is only read when uvalpha >= 2048
 * @param vbuf     two V rows; vbuf[1] is only read when uvalpha >= 2048
 * @param abuf0    unused in this function
 * @param dest     output line, written four bytes per pixel pair
 * @param dstW     output width in pixels; (dstW + 1) >> 1 pairs are written
 * @param uvalpha  selects between the single-row and averaged chroma paths
 * @param y        unused in this function
 * @param target   byte order selector consumed by output_pixels()
 */
static av_always_inline void
yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
                     const int16_t *ubuf[2], const int16_t *vbuf[2],
                     const int16_t *abuf0, uint8_t *dest, int dstW,
                     int uvalpha, int y, enum AVPixelFormat target)
{
    const int16_t *cb_row0 = ubuf[0];
    const int16_t *cr_row0 = vbuf[0];
    const int pairs = (dstW + 1) >> 1;

    if (uvalpha < 2048) {
        /* Single chroma row. */
        for (int n = 0; n < pairs; n++) {
            const int even = n * 2;
            int luma0 = (buf0[even]     + 64) >> 7;
            int luma1 = (buf0[even + 1] + 64) >> 7;
            int cb    = (cb_row0[n]     + 64) >> 7;
            int cr    = (cr_row0[n]     + 64) >> 7;

            /* Clip all four only when bit 8 is set in at least one. */
            if ((luma0 | luma1 | cb | cr) & 0x100) {
                luma0 = av_clip_uint8(luma0);
                luma1 = av_clip_uint8(luma1);
                cb    = av_clip_uint8(cb);
                cr    = av_clip_uint8(cr);
            }

            output_pixels(n * 4, luma0, cb, luma1, cr);
        }
        return;
    }

    /* Average of the two chroma rows. */
    const int16_t *cb_row1 = ubuf[1];
    const int16_t *cr_row1 = vbuf[1];

    for (int n = 0; n < pairs; n++) {
        const int even = n * 2;
        int luma0 = (buf0[even]              + 64)  >> 7;
        int luma1 = (buf0[even + 1]          + 64)  >> 7;
        int cb    = (cb_row0[n] + cb_row1[n] + 128) >> 8;
        int cr    = (cr_row0[n] + cr_row1[n] + 128) >> 8;

        /* Clip all four only when bit 8 is set in at least one. */
        if ((luma0 | luma1 | cb | cr) & 0x100) {
            luma0 = av_clip_uint8(luma0);
            luma1 = av_clip_uint8(luma1);
            cb    = av_clip_uint8(cb);
            cr    = av_clip_uint8(cr);
        }

        output_pixels(n * 4, luma0, cb, luma1, cr);
    }
}
[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

When the tyrant has disposed of foreign enemies by conquest or treaty, and
there is nothing more to fear from them, then he is always stirring up
some war or other, in order that the people may require a leader. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 195 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20241012/4142c6b5/attachment.sig>