[FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
Rémi Denis-Courmont
remi at remlab.net
Mon May 22 19:42:07 EEST 2023
On Monday, 22 May 2023 at 17:48:40 EEST, Arnie Chang wrote:
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,40 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> + int flags = av_get_cpu_flags();
> +
> + if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> + c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> + c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> + }
You should check that the vector length is large enough:
`ff_get_rv_vlenb() >= 16`
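Roughly like this (untested sketch; assuming ff_get_rv_vlenb() is declared by libavutil/riscv/cpu.h, which then needs to be included):

    #include "libavutil/riscv/cpu.h"

    av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
    {
    #if HAVE_RVV
        int flags = av_get_cpu_flags();

        /* The MC8 code works on whole 8-pixel rows (plus one extra pixel for
         * the horizontal filter) per register, so require VLEN >= 128 bits
         * (16 bytes) in addition to the RVV capability flag. */
        if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
            ff_get_rv_vlenb() >= 16) {
            c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
            c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
        }
    #endif
    }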
> +#endif
> +}
> +
> diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
> new file mode 100644
> index 0000000000..1c373c8cc7
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -0,0 +1,306 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include "libavutil/riscv/asm.S"
> +
> +.macro h264_chroma_mc8 type
> +func h264_\type\()_chroma_mc8_rvv, zvl128b
That works, but `zve32x` would be more idiomatic and consistent with existing code. Selecting a vector length in the assembler doesn't really do anything other than enable `zve32x` implicitly anyway.
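In other words just:

    func h264_\type\()_chroma_mc8_rvv, zve32x

like the existing RVV functions do.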
> + slliw t2, a5, 3
Don't use the narrow (32-bit "W") arithmetic instructions unless it is really necessary. AFAICT, the C compiler will sign-extend `a5` to XLEN bits, so you should not need to care.
But if you really do need 32- rather than XLEN-bit instructions, then you should gate the code: `#if (__riscv_xlen >= 64)`
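In other words, either use the XLEN-wide forms throughout:

        slli    t2, a5, 3
        mul     t1, a5, a4

or, if 32-bit semantics were genuinely needed somewhere, gate the W forms (sketch):

    #if (__riscv_xlen >= 64)
        slliw   t2, a5, 3
    #else
        slli    t2, a5, 3
    #endif

since the W-suffix instructions only exist on RV64.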
> + mulw t1, a5, a4
> + sh3add a5, a4, t2
> + slliw a4, a4, 3
> + subw a5, t1, a5
> + subw a7, a4, t1
> + addiw a6, a5, 64
> + subw t0, t2, t1
> + vsetivli t3, 8, e8, m1, ta, mu
> + beqz t1, 2f
> + blez a3, 8f
> + li t4, 0
> + li t2, 0
> + li t5, 1
> + addi a5, t3, 1
> + slli t3, a2, 2
> +1: # if (xy != 0)
> + add a4, a1, t4
> + vsetvli zero, a5, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v10, (a4)
> + add a4, a4, a2
> + vslide1down.vx v11, v10, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v8, v10, a6
> + vwmaccu.vx v8, a7, v11
From a quick glance, the code seems to be using between a quarter and half of the vector bank, so it would be preferable to use exclusively even-numbered registers. Then we can double LMUL easily later if that turns out faster.
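For instance the odd-numbered temporaries here could be moved to even ones (illustrative only; the rest of the register allocation would have to be adjusted to match):

        vslide1down.vx  v12, v10, t5
        vsetivli        zero, 8, e8, m1, ta, ma
        vwmulu.vx       v8, v10, a6
        vwmaccu.vx      v8, a7, v12

so that all register groups stay legally aligned if LMUL is doubled.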
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v12, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v8, t0, v12
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslide1down.vx v13, v12, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v12, a6
> + vwmaccu.vx v8, t1, v13
> + vwmaccu.vx v10, a7, v13
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v10, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslide1down.vx v15, v14, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v14, a6
> + vwmaccu.vx v10, t1, v15
> + vwmaccu.vx v12, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v12, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslide1down.vx v15, v14, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v16, v14, a6
> + vwmaccu.vx v12, t1, v15
> + vwmaccu.vx v16, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a0, t4
> + add t4, t4, t3
> + vwmaccu.vx v16, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslide1down.vx v14, v14, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v15, v8, 6
> + vwmaccu.vx v16, t1, v14
> + .ifc \type,avg
> + vle8.v v9, (a4)
> + vaaddu.vv v15, v15, v9
> + .endif
> + vse8.v v15, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + .ifc \type,avg
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + .endif
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v12, 6
> + .ifc \type,avg
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + .endif
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v16, 6
> + .ifc \type,avg
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + .endif
> + vse8.v v8, (a4)
> + blt t2, a3, 1b
> + j 8f
> +2:
> + bnez a4, 4f
> + beqz t2, 4f
> + blez a3, 8f
> + li a4, 0
> + li t1, 0
> + slli a7, a2, 2
> +3: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> + add a5, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t1, t1, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + add t2, a5, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a5)
> + vwmulu.vx v12, v8, a6
> + vle8.v v9, (t2)
> + add t2, t2, a2
> + add a5, t2, a2
> + vwmaccu.vx v10, t0, v8
> + vle8.v v8, (t2)
> + vle8.v v14, (a5)
> + add a5, a0, a4
> + add a4, a4, a7
> + vwmaccu.vx v12, t0, v9
> + vnclipu.wi v15, v10, 6
> + vwmulu.vx v10, v9, a6
> + .ifc \type,avg
> + vle8.v v16, (a5)
> + vaaddu.vv v15, v15, v16
> + .endif
> + vse8.v v15, (a5)
The store depends on the previous add, which depends on the previous load. That might presumably cause some pipeline delay depending on the IP. You may want to reorder independent vector instructions a little bit.
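For instance the averaging load can be issued right after the address computation, so that independent vector work sits between it and the store (same semantics, merely shuffled; whether it actually helps depends on the core):

        add             a5, a0, a4
        add             a4, a4, a7
        .ifc \type,avg
        vle8.v          v16, (a5)
        .endif
        vwmaccu.vx      v12, t0, v9    # independent work overlaps the load
        vnclipu.wi      v15, v10, 6
        vwmulu.vx       v10, v9, a6
        .ifc \type,avg
        vaaddu.vv       v15, v15, v16
        .endif
        vse8.v          v15, (a5)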
> + add a5, a5, a2
> + vnclipu.wi v9, v12, 6
> + vwmaccu.vx v10, t0, v8
> + vwmulu.vx v12, v8, a6
> + .ifc \type,avg
> + vle8.v v16, (a5)
> + vaaddu.vv v9, v9, v16
> + .endif
> + vse8.v v9, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmaccu.vx v12, t0, v14
> + .ifc \type,avg
> + vle8.v v16, (a5)
> + vaaddu.vv v8, v8, v16
> + .endif
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v12, 6
> + .ifc \type,avg
> + vle8.v v16, (a5)
> + vaaddu.vv v8, v8, v16
> + .endif
> + vse8.v v8, (a5)
> + blt t1, a3, 3b
> + j 8f
> +4:
> + beqz a4, 6f
> + bnez t2, 6f
> + blez a3, 8f
> + li a4, 0
> + li t2, 0
> + addi t0, t3, 1
> + slli t1, a2, 2
> +5: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> + add a5, a1, a4
> + vsetvli zero, t0, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslide1down.vx v9, v8, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v8, a6
> + vwmaccu.vx v10, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslide1down.vx v9, v8, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v8, a6
> + vwmaccu.vx v12, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslide1down.vx v9, v8, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v14, v8, a6
> + vwmaccu.vx v14, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a0, a4
> + add a4, a4, t1
> + vslide1down.vx v9, v8, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v16, v10, 6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v16, v16, v18
> + .endif
> + vse8.v v16, (a5)
> + add a5, a5, a2
> + vnclipu.wi v10, v12, 6
> + vwmulu.vx v12, v8, a6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v10, v10, v18
> + .endif
> + vse8.v v10, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v14, 6
> + vwmaccu.vx v12, a7, v9
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v12, 6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + blt t2, a3, 5b
> + j 8f
> +6:
> + blez a3, 8f
> + li a4, 0
> + li t2, 0
> + slli a7, a2, 2
> +7: # the final else, none of the above conditions are met
> + add t0, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + add a5, a0, a4
> + add a4, a4, a7
> + addiw t2, t2, 4
> + vle8.v v8, (t0)
> + add t0, t0, a2
> + add t1, t0, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (t0)
> + add t0, t1, a2
> + vle8.v v9, (t1)
> + vle8.v v12, (t0)
> + vnclipu.wi v13, v10, 6
> + vwmulu.vx v10, v8, a6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v13, v13, v18
> + .endif
> + vse8.v v13, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v9, a6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v12, a6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + blt t2, a3, 7b
> +8:
> + ret
> +endfunc
> +.endm
> +
> +h264_chroma_mc8 put
> +h264_chroma_mc8 avg
--
Rémi Denis-Courmont
http://www.remlab.net/