[FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
Rémi Denis-Courmont
remi at remlab.net
Fri May 19 20:12:33 EEST 2023
On Wednesday, 17 May 2023 at 10:13:01 EEST, Arnie Chang wrote:
> Optimize the put and avg filtering for 8x8 chroma blocks
>
> Signed-off-by: Arnie Chang <arnie.chang at sifive.com>
> ---
> libavcodec/h264chroma.c | 2 +
> libavcodec/h264chroma.h | 1 +
> libavcodec/riscv/Makefile | 3 +
> libavcodec/riscv/h264_chroma_init_riscv.c | 39 ++
> libavcodec/riscv/h264_mc_chroma.S | 492 ++++++++++++++++++++++
> libavcodec/riscv/h264_mc_chroma.h | 34 ++
> 6 files changed, 571 insertions(+)
> create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
> create mode 100644 libavcodec/riscv/h264_mc_chroma.S
> create mode 100644 libavcodec/riscv/h264_mc_chroma.h
>
> diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
> index 60b86b6fba..1eeab7bc40 100644
> --- a/libavcodec/h264chroma.c
> +++ b/libavcodec/h264chroma.c
> @@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
>     ff_h264chroma_init_mips(c, bit_depth);
> #elif ARCH_LOONGARCH64
> ff_h264chroma_init_loongarch(c, bit_depth);
> +#elif ARCH_RISCV
> + ff_h264chroma_init_riscv(c, bit_depth);
> #endif
> }
> diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
> index b8f9c8f4fc..9c81c18a76 100644
> --- a/libavcodec/h264chroma.h
> +++ b/libavcodec/h264chroma.h
> @@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int
> bit_depth); void ff_h264chroma_init_x86(H264ChromaContext *c, int
> bit_depth); void ff_h264chroma_init_mips(H264ChromaContext *c, int
> bit_depth); void ff_h264chroma_init_loongarch(H264ChromaContext *c, int
> bit_depth); +void ff_h264chroma_init_riscv(H264ChromaContext *c, int
> bit_depth);
>
> #endif /* AVCODEC_H264CHROMA_H */
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 965942f4df..08b76c93cb 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -19,3 +19,6 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
> RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
> OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +
> +OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> +RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
Please maintain the existing ordering, which is to say, alphabetical.
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
> new file mode 100644
> index 0000000000..b6f98ba693
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> + const int high_bit_depth = bit_depth > 8;
> +
> + if (!high_bit_depth) {
> + c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> + c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> + }
> +#endif
> +}
> \ No newline at end of file
> diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
> new file mode 100644
> index 0000000000..a02866f633
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -0,0 +1,492 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> + .text
> +
> + .globl h264_put_chroma_mc8_rvv
> + .p2align 1
> + .type h264_put_chroma_mc8_rvv, @function
> +h264_put_chroma_mc8_rvv:
> + slliw t2, a5, 3
> + mulw t1, a5, a4
> + sh3add a5, a4, t2
> + slliw a4, a4, 3
> + subw a5, t1, a5
> + subw a7, a4, t1
> + addiw a6, a5, 64
> + subw t0, t2, t1
I would avoid the narrow (W-suffixed) instructions unless strictly necessary, 
so that we don't unnecessarily restrict target platforms.
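Untested sketch, keeping the patch's register allocation: since x and y are at 
most 7 here, every intermediate fits in well under 32 bits, the XLEN-wide 
forms produce the same low bits, and the later *.vx operands only consume the 
low SEW bits anyway:

        slli            t2, a5, 3      # y * 8
        mul             t1, a5, a4     # x * y
        sh3add          a5, a4, t2     # x * 8 + y * 8
        slli            a4, a4, 3      # x * 8
        sub             a5, t1, a5     # xy - 8x - 8y
        sub             a7, a4, t1     # 8x - xy
        addi            a6, a5, 64     # A = 64 - 8x - 8y + xy
        sub             t0, t2, t1     # 8y - xy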
> + vsetivli t3, 8, e8, m1, ta, mu
> + beqz t1, .LBB0_4
> + blez a3, .LBB0_17
If you're going to use numeric labels (which is totally fine, I do that too), 
then you really can just use the usual numeric local label assembler syntax. 
There is no need to create local branch symbols.
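For example (the label numbers are of course arbitrary):

1:      # if (xy != 0)
        ...
        blt             t2, a3, 1b
        j               9f
        ...
9:      # exit
        ret
        .size  h264_put_chroma_mc8_rvv, . - h264_put_chroma_mc8_rvv

`1b`/`9f` resolve to the nearest matching numeric label backward/forward, and 
.size can use the location counter directly, so the .LBB*/.Lfunc_end* symbols 
can all go.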
> + li t4, 0
> + li t2, 0
> + addi a5, t3, 1
> + slli t3, a2, 2
> +.LBB0_3: # if (xy != 0)
> + add a4, a1, t4
> + vsetvli zero, a5, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v10, (a4)
> + add a4, a4, a2
> + vslidedown.vi v11, v10, 1
Isn't vslide1down.vx zero potentially faster than vslidedown.vi 1?
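That is (untested):

-        vslidedown.vi   v11, v10, 1
+        vslide1down.vx  v11, v10, zero

The only observable difference is that the last element becomes zero instead 
of the old tail element, and nothing reads it here since the subsequent 
widening ops run with vl = 8.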
> + vsetivli zero, 8, e8, m1, ta, ma
Do we really need to reconfigure the active vector length so many times? I 
suspect that is not going to go down too well with some implementations.
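One possibility (untested; t5 is just a scratch register and the label number 
is arbitrary): trade the slide for a second, overlapping unit-stride load, so 
that vl stays at 8 and vsetivli runs once before the loop:

        vsetivli        zero, 8, e8, m1, ta, ma
1:
        addi            t5, a4, 1
        vle8.v          v10, (a4)               # src[0..7]
        vle8.v          v11, (t5)               # src[1..8]
        add             a4, a4, a2
        vwmulu.vx       v8, v10, a6
        vwmaccu.vx      v8, a7, v11
        ...

Whether the extra overlapping load beats the slide is obviously 
micro-architecture dependent, but at least vtype stays put for the whole loop.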
> + vwmulu.vx v8, v10, a6
> + vwmaccu.vx v8, a7, v11
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v12, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v8, t0, v12
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v13, v12, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v12, a6
> + vwmaccu.vx v8, t1, v13
> + vwmaccu.vx v10, a7, v13
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v10, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v15, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v14, a6
> + vwmaccu.vx v10, t1, v15
> + vwmaccu.vx v12, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v12, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v15, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v16, v14, a6
> + vwmaccu.vx v12, t1, v15
> + vwmaccu.vx v16, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a0, t4
> + add t4, t4, t3
I could be totally wrong since I have no hardware to verify with, but I would 
assume that it is preferable to interleave independent scalar and vector 
instructions wherever possible. For out-of-order processors, it shouldn't 
matter, but I suppose that it would matter on in-order multi-issue processors.
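Continuing the vl = 8 sketch from above (scheduling illustration only, t5 is 
again just a scratch pick):

        vle8.v          v10, (a4)
        addi            t5, a4, 1               # next address computed while the load is in flight
        vle8.v          v11, (t5)
        add             a4, a4, a2              # row pointer bumped behind the vector work
        vwmulu.vx       v8, v10, a6
        addi            t2, t2, 4               # ditto for the loop counter
        vwmaccu.vx      v8, a7, v11

rather than queuing the scalar pointer and counter updates back to back 
around the loads and stores.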
> + vwmaccu.vx v16, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v14, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v15, v8, 6
> + vwmaccu.vx v16, t1, v14
> + vse8.v v15, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v12, 6
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v16, 6
> + vse8.v v8, (a4)
> + blt t2, a3, .LBB0_3
> + j .LBB0_17
> +.LBB0_4:
> + bnez a4, .LBB0_9
> + beqz t2, .LBB0_9
> + blez a3, .LBB0_17
> + li a4, 0
> + li t1, 0
> + slli a7, a2, 2
> +.LBB0_8: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> + add a5, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t1, t1, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + add t2, a5, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a5)
> + vwmulu.vx v12, v8, a6
> + vle8.v v9, (t2)
> + add t2, t2, a2
> + add a5, t2, a2
> + vwmaccu.vx v10, t0, v8
> + vle8.v v8, (t2)
> + vle8.v v14, (a5)
> + add a5, a0, a4
> + add a4, a4, a7
> + vwmaccu.vx v12, t0, v9
> + vnclipu.wi v15, v10, 6
> + vwmulu.vx v10, v9, a6
> + vse8.v v15, (a5)
> + add a5, a5, a2
> + vnclipu.wi v9, v12, 6
> + vwmaccu.vx v10, t0, v8
> + vwmulu.vx v12, v8, a6
> + vse8.v v9, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmaccu.vx v12, t0, v14
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v12, 6
> + vse8.v v8, (a5)
> + blt t1, a3, .LBB0_8
> + j .LBB0_17
> +.LBB0_9:
> + beqz a4, .LBB0_14
> + bnez t2, .LBB0_14
> + blez a3, .LBB0_17
> + li a4, 0
> + li t2, 0
> + addi t0, t3, 1
> + slli t1, a2, 2
> +.LBB0_13: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> + add a5, a1, a4
> + vsetvli zero, t0, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v8, a6
> + vwmaccu.vx v10, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v8, a6
> + vwmaccu.vx v12, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v14, v8, a6
> + vwmaccu.vx v14, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a0, a4
> + add a4, a4, t1
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v16, v10, 6
> + vse8.v v16, (a5)
> + add a5, a5, a2
> + vnclipu.wi v10, v12, 6
> + vwmulu.vx v12, v8, a6
> + vse8.v v10, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v14, 6
> + vwmaccu.vx v12, a7, v9
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v12, 6
> + vse8.v v8, (a5)
> + blt t2, a3, .LBB0_13
> + j .LBB0_17
> +.LBB0_14:
> + blez a3, .LBB0_17
> + li a4, 0
> + li t2, 0
> + slli a7, a2, 2
> +.LBB0_16: # the final else, none of the above conditions are met
> + add t0, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + add a5, a0, a4
> + add a4, a4, a7
> + addiw t2, t2, 4
> + vle8.v v8, (t0)
> + add t0, t0, a2
> + add t1, t0, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (t0)
> + add t0, t1, a2
> + vle8.v v9, (t1)
> + vle8.v v12, (t0)
> + vnclipu.wi v13, v10, 6
> + vwmulu.vx v10, v8, a6
> + vse8.v v13, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v9, a6
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v12, a6
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vse8.v v8, (a5)
> + blt t2, a3, .LBB0_16
> +.LBB0_17: # Exit h264_put_chroma_mc8_rvv
> + ret
> +.Lfunc_end0:
> + .size h264_put_chroma_mc8_rvv, .Lfunc_end0-h264_put_chroma_mc8_rvv
> +
> + .globl h264_avg_chroma_mc8_rvv
> + .p2align 1
> + .type h264_avg_chroma_mc8_rvv, @function
> +h264_avg_chroma_mc8_rvv:
> + slliw t2, a5, 3
> + mulw t1, a5, a4
> + sh3add a5, a4, t2
> + slliw a4, a4, 3
> + subw a5, t1, a5
> + subw a7, a4, t1
> + addiw a6, a5, 64
> + subw t0, t2, t1
> + vsetivli t3, 8, e8, m1, ta, mu
> + beqz t1, .LBB1_4
> + blez a3, .LBB1_17
> + li t4, 0
> + li t2, 0
> + addi a5, t3, 1
> + slli t3, a2, 2
> +.LBB1_3: # if (xy != 0)
> + add a4, a1, t4
> + vsetvli zero, a5, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v10, (a4)
> + add a4, a4, a2
> + vslidedown.vi v11, v10, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v8, v10, a6
> + vwmaccu.vx v8, a7, v11
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v12, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v8, t0, v12
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v13, v12, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v12, a6
> + vwmaccu.vx v8, t1, v13
> + vwmaccu.vx v10, a7, v13
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v10, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v15, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v14, a6
> + vwmaccu.vx v10, t1, v15
> + vwmaccu.vx v12, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v12, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v15, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v16, v14, a6
> + vwmaccu.vx v12, t1, v15
> + vwmaccu.vx v16, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a0, t4
> + add t4, t4, t3
> + vwmaccu.vx v16, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v14, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v15, v8, 6
> + vle8.v v8, (a4)
> + vwmaccu.vx v16, t1, v14
> + vaaddu.vv v8, v15, v8
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v12, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v16, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + blt t2, a3, .LBB1_3
> + j .LBB1_17
> +.LBB1_4:
> + bnez a4, .LBB1_9
> + beqz t2, .LBB1_9
> + blez a3, .LBB1_17
> + li t2, 0
> + li t1, 0
> + slli a7, a2, 2
> +.LBB1_8: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> + add a4, a1, t2
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t1, t1, 4
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + add a5, a4, a2
> + vle8.v v9, (a4)
> + add a4, a5, a2
> + vle8.v v12, (a5)
> + vwmaccu.vx v10, t0, v8
> + vle8.v v13, (a4)
> + add a4, a0, t2
> + add t2, t2, a7
> + vnclipu.wi v14, v10, 6
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a4)
> + vaaddu.vv v8, v14, v8
> + vwmaccu.vx v10, t0, v9
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v9, a6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vwmaccu.vx v10, t0, v12
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v12, a6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vwmaccu.vx v10, t0, v13
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + blt t1, a3, .LBB1_8
> + j .LBB1_17
> +.LBB1_9:
> + beqz a4, .LBB1_14
> + bnez t2, .LBB1_14
> + blez a3, .LBB1_17
> + li a5, 0
> + li t2, 0
> + addi t0, t3, 1
> + slli t1, a2, 2
> +.LBB1_13: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> + add a4, a1, a5
> + vsetvli zero, t0, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v8, a6
> + vwmaccu.vx v10, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v8, a6
> + vwmaccu.vx v12, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v14, v8, a6
> + vwmaccu.vx v14, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a4)
> + add a4, a0, a5
> + add a5, a5, t1
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v16, v10, 6
> + vle8.v v10, (a4)
> + vaaddu.vv v10, v16, v10
> + vse8.v v10, (a4)
> + add a4, a4, a2
> + vnclipu.wi v10, v12, 6
> + vle8.v v11, (a4)
> + vwmulu.vx v12, v8, a6
> + vaaddu.vv v10, v10, v11
> + vwmaccu.vx v12, a7, v9
> + vse8.v v10, (a4)
> + add a4, a4, a2
> + vnclipu.wi v10, v14, 6
> + vle8.v v8, (a4)
> + vaaddu.vv v8, v10, v8
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v12, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + blt t2, a3, .LBB1_13
> + j .LBB1_17
> +.LBB1_14:
> + blez a3, .LBB1_17
> + li a4, 0
> + li t0, 0
> + slli a7, a2, 2
> +.LBB1_16: # the final else, none of the above conditions are met
> + add a5, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t0, t0, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + add t1, a5, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a5)
> + add a5, t1, a2
> + vle8.v v9, (t1)
> + vle8.v v12, (a5)
> + add a5, a0, a4
> + add a4, a4, a7
> + vnclipu.wi v13, v10, 6
> + vle8.v v10, (a5)
> + vwmulu.vx v14, v8, a6
> + vaaddu.vv v10, v13, v10
> + vse8.v v10, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v14, 6
> + vle8.v v10, (a5)
> + vaaddu.vv v8, v8, v10
> + vwmulu.vx v10, v9, a6
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vle8.v v9, (a5)
> + vwmulu.vx v10, v12, a6
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vle8.v v9, (a5)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a5)
> + blt t0, a3, .LBB1_16
> +.LBB1_17: # Exit h264_avg_chroma_mc8_rvv
> + ret
> +.Lfunc_end1:
> + .size h264_avg_chroma_mc8_rvv, .Lfunc_end1-h264_avg_chroma_mc8_rvv
--
Реми Дёни-Курмон
http://www.remlab.net/