[FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks

Rémi Denis-Courmont remi at remlab.net
Mon May 22 19:42:07 EEST 2023


On Monday, 22 May 2023 at 17:48:40 EEST, Arnie Chang wrote:
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,40 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> +    int flags = av_get_cpu_flags();
> +
> +    if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> +        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> +    }

You should check that the vector length is large enough:
`ff_get_rv_vlenb() >= 16`
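
Something like this (untested sketch) ought to do it; note that `ff_get_rv_vlenb()` is declared in libavutil/riscv/cpu.h, so that header would need to be included as well:

    #if HAVE_RVV
        int flags = av_get_cpu_flags();

        /* only use the RVV code on hardware with at least 128-bit (16-byte) vectors */
        if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
            ff_get_rv_vlenb() >= 16) {
            c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
            c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
        }
    #endif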

> +#endif
> +}
> +
> diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
> new file mode 100644
> index 0000000000..1c373c8cc7
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -0,0 +1,306 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include "libavutil/riscv/asm.S"
> +
> +.macro  h264_chroma_mc8 type
> +func h264_\type\()_chroma_mc8_rvv, zvl128b

That works, but `zve32x` would be more idiomatic and consistent with existing code. Selecting a vector length in the assembler doesn't really do anything other than enable `zve32x` implicitly anyway.
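
In other words, simply:

    func h264_\type\()_chroma_mc8_rvv, zve32x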

> +        slliw           t2, a5, 3

Don't use the narrow ALU instructions (the 32-bit "W" forms) unless they are really necessary. AFAICT, the C compiler will already sign-extend `a5` to XLEN bits, so you should not need to care.

But if you really do need 32-bit rather than XLEN-bit instructions, then you should gate the code with `#if (__riscv_xlen >= 64)`, since those forms only exist on RV64.
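
For instance, if the narrowing shift really were needed, the gating would look roughly like this (sketch only):

    #if (__riscv_xlen >= 64)
            slliw           t2, a5, 3
    #else
            slli            t2, a5, 3   # plain shifts are already 32-bit on RV32
    #endif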

> +        mulw            t1, a5, a4
> +        sh3add          a5, a4, t2
> +        slliw           a4, a4, 3
> +        subw            a5, t1, a5
> +        subw            a7, a4, t1
> +        addiw           a6, a5, 64
> +        subw            t0, t2, t1
> +        vsetivli        t3, 8, e8, m1, ta, mu
> +        beqz            t1, 2f
> +        blez            a3, 8f
> +        li              t4, 0
> +        li              t2, 0
> +        li              t5, 1
> +        addi            a5, t3, 1
> +        slli            t3, a2, 2
> +1:                                # if (xy != 0)
> +        add             a4, a1, t4
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        addiw           t2, t2, 4
> +        vle8.v          v10, (a4)
> +        add             a4, a4, a2
> +        vslide1down.vx  v11, v10, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v8, v10, a6
> +        vwmaccu.vx      v8, a7, v11

From a quick glance, the code seems to be using between a quarter and half of the vector register bank, so it would be preferable to use exclusively even-numbered registers. Then we can easily double LMUL later if that turns out to be faster, since register groups must start on even-numbered registers at LMUL=2.

> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vle8.v          v12, (a4)
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        add             a4, a4, a2
> +        vwmaccu.vx      v8, t0, v12
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vslide1down.vx  v13, v12, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v10, v12, a6
> +        vwmaccu.vx      v8, t1, v13
> +        vwmaccu.vx      v10, a7, v13
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vle8.v          v14, (a4)
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        add             a4, a4, a2
> +        vwmaccu.vx      v10, t0, v14
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vslide1down.vx  v15, v14, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v12, v14, a6
> +        vwmaccu.vx      v10, t1, v15
> +        vwmaccu.vx      v12, a7, v15
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vle8.v          v14, (a4)
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        add             a4, a4, a2
> +        vwmaccu.vx      v12, t0, v14
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vslide1down.vx  v15, v14, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v16, v14, a6
> +        vwmaccu.vx      v12, t1, v15
> +        vwmaccu.vx      v16, a7, v15
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vle8.v          v14, (a4)
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        add             a4, a0, t4
> +        add             t4, t4, t3
> +        vwmaccu.vx      v16, t0, v14
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vslide1down.vx  v14, v14, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vnclipu.wi      v15, v8, 6
> +        vwmaccu.vx      v16, t1, v14
> +  .ifc \type,avg
> +        vle8.v          v9, (a4)
> +        vaaddu.vv       v15, v15, v9
> +  .endif
> +        vse8.v          v15, (a4)
> +        add             a4, a4, a2
> +        vnclipu.wi      v8, v10, 6
> +  .ifc \type,avg
> +        vle8.v          v9, (a4)
> +        vaaddu.vv       v8, v8, v9
> +  .endif
> +        vse8.v          v8, (a4)
> +        add             a4, a4, a2
> +        vnclipu.wi      v8, v12, 6
> +  .ifc \type,avg
> +        vle8.v          v9, (a4)
> +        vaaddu.vv       v8, v8, v9
> +  .endif
> +        vse8.v          v8, (a4)
> +        add             a4, a4, a2
> +        vnclipu.wi      v8, v16, 6
> +  .ifc \type,avg
> +        vle8.v          v9, (a4)
> +        vaaddu.vv       v8, v8, v9
> +  .endif
> +        vse8.v          v8, (a4)
> +        blt             t2, a3, 1b
> +        j               8f
> +2:
> +        bnez            a4, 4f
> +        beqz            t2, 4f
> +        blez            a3, 8f
> +        li              a4, 0
> +        li              t1, 0
> +        slli            a7, a2, 2
> +3:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> +        add             a5, a1, a4
> +        vsetvli         zero, zero, e8, m1, ta, ma
> +        addiw           t1, t1, 4
> +        vle8.v          v8, (a5)
> +        add             a5, a5, a2
> +        add             t2, a5, a2
> +        vwmulu.vx       v10, v8, a6
> +        vle8.v          v8, (a5)
> +        vwmulu.vx       v12, v8, a6
> +        vle8.v          v9, (t2)
> +        add             t2, t2, a2
> +        add             a5, t2, a2
> +        vwmaccu.vx      v10, t0, v8
> +        vle8.v          v8, (t2)
> +        vle8.v          v14, (a5)
> +        add             a5, a0, a4
> +        add             a4, a4, a7
> +        vwmaccu.vx      v12, t0, v9
> +        vnclipu.wi      v15, v10, 6
> +        vwmulu.vx       v10, v9, a6
> +  .ifc \type,avg
> +        vle8.v          v16, (a5)
> +        vaaddu.vv       v15, v15, v16
> +  .endif
> +        vse8.v          v15, (a5)

The store depends on the previous add, which in turn depends on the previous load. Depending on the IP (the hardware implementation), that dependency chain can presumably cause some pipeline stalls. You may want to interleave independent vector instructions a little bit to hide the latency.

> +        add             a5, a5, a2
> +        vnclipu.wi      v9, v12, 6
> +        vwmaccu.vx      v10, t0, v8
> +        vwmulu.vx       v12, v8, a6
> +  .ifc \type,avg
> +        vle8.v          v16, (a5)
> +        vaaddu.vv       v9, v9, v16
> +  .endif
> +        vse8.v          v9, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v10, 6
> +        vwmaccu.vx      v12, t0, v14
> +  .ifc \type,avg
> +        vle8.v          v16, (a5)
> +        vaaddu.vv       v8, v8, v16
> +  .endif
> +        vse8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v12, 6
> +  .ifc \type,avg
> +        vle8.v          v16, (a5)
> +        vaaddu.vv       v8, v8, v16
> +  .endif
> +        vse8.v          v8, (a5)
> +        blt             t1, a3, 3b
> +        j               8f
> +4:
> +        beqz            a4, 6f
> +        bnez            t2, 6f
> +        blez            a3, 8f
> +        li              a4, 0
> +        li              t2, 0
> +        addi            t0, t3, 1
> +        slli            t1, a2, 2
> +5:                               # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> +        add             a5, a1, a4
> +        vsetvli         zero, t0, e8, m1, ta, ma
> +        addiw           t2, t2, 4
> +        vle8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vslide1down.vx  v9, v8, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v10, v8, a6
> +        vwmaccu.vx      v10, a7, v9
> +        vsetvli         zero, t0, e8, m1, ta, ma
> +        vle8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vslide1down.vx  v9, v8, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v12, v8, a6
> +        vwmaccu.vx      v12, a7, v9
> +        vsetvli         zero, t0, e8, m1, ta, ma
> +        vle8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vslide1down.vx  v9, v8, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v14, v8, a6
> +        vwmaccu.vx      v14, a7, v9
> +        vsetvli         zero, t0, e8, m1, ta, ma
> +        vle8.v          v8, (a5)
> +        add             a5, a0, a4
> +        add             a4, a4, t1
> +        vslide1down.vx  v9, v8, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vnclipu.wi      v16, v10, 6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v16, v16, v18
> +  .endif
> +        vse8.v          v16, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v10, v12, 6
> +        vwmulu.vx       v12, v8, a6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v10, v10, v18
> +  .endif
> +        vse8.v          v10, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v14, 6
> +        vwmaccu.vx      v12, a7, v9
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v12, 6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        blt             t2, a3, 5b
> +        j               8f
> +6:
> +        blez            a3, 8f
> +        li              a4, 0
> +        li              t2, 0
> +        slli            a7, a2, 2
> +7:                               # the final else, none of the above conditions are met
> +        add             t0, a1, a4
> +        vsetvli         zero, zero, e8, m1, ta, ma
> +        add             a5, a0, a4
> +        add             a4, a4, a7
> +        addiw           t2, t2, 4
> +        vle8.v          v8, (t0)
> +        add             t0, t0, a2
> +        add             t1, t0, a2
> +        vwmulu.vx       v10, v8, a6
> +        vle8.v          v8, (t0)
> +        add             t0, t1, a2
> +        vle8.v          v9, (t1)
> +        vle8.v          v12, (t0)
> +        vnclipu.wi      v13, v10, 6
> +        vwmulu.vx       v10, v8, a6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v13, v13, v18
> +  .endif
> +        vse8.v          v13, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v10, 6
> +        vwmulu.vx       v10, v9, a6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v10, 6
> +        vwmulu.vx       v10, v12, a6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v10, 6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        blt             t2, a3, 7b
> +8:
> +        ret
> +endfunc
> +.endm
> +
> +h264_chroma_mc8 put
> +h264_chroma_mc8 avg

-- 
Rémi Denis-Courmont
http://www.remlab.net/




