[FFmpeg-devel] [PATCH v2] avcodec/riscv: add h264 dc idct rvv
Rémi Denis-Courmont
remi at remlab.net
Wed Jul 3 18:13:25 EEST 2024
Le keskiviikkona 3. heinäkuuta 2024, 13.47.29 EEST J. Dekker a écrit :
> checkasm: bench runs 131072 (1 << 17)
> h264_idct4_add_dc_8bpp_c: 1.5
> h264_idct4_add_dc_8bpp_rvv_i64: 0.7
> h264_idct4_add_dc_9bpp_c: 1.5
> h264_idct4_add_dc_9bpp_rvv_i64: 0.7
> h264_idct4_add_dc_10bpp_c: 1.5
> h264_idct4_add_dc_10bpp_rvv_i64: 0.7
> h264_idct4_add_dc_12bpp_c: 1.2
> h264_idct4_add_dc_12bpp_rvv_i64: 0.7
> h264_idct4_add_dc_14bpp_c: 1.2
> h264_idct4_add_dc_14bpp_rvv_i64: 0.7
> h264_idct8_add_dc_8bpp_c: 5.2
> h264_idct8_add_dc_8bpp_rvv_i64: 1.5
> h264_idct8_add_dc_9bpp_c: 5.5
> h264_idct8_add_dc_9bpp_rvv_i64: 1.2
> h264_idct8_add_dc_10bpp_c: 5.5
> h264_idct8_add_dc_10bpp_rvv_i64: 1.2
> h264_idct8_add_dc_12bpp_c: 4.2
> h264_idct8_add_dc_12bpp_rvv_i64: 1.2
> h264_idct8_add_dc_14bpp_c: 4.2
> h264_idct8_add_dc_14bpp_rvv_i64: 1.2
>
> Signed-off-by: J. Dekker <jdek at itanimul.li>
> ---
>
> rdcycle always returns 0 on my board, clock_gettime() seems as noisy as
> rdtime (just with bigger numbers).
On K230? Odd. Maybe the vendor made some updates in later builds.
> libavcodec/riscv/Makefile | 1 +
> libavcodec/riscv/h264dsp_init.c | 42 +++++++-
> libavcodec/riscv/h264dsp_rvv.S | 176 ++++++++++++++++++++++++++++++++
> 3 files changed, 216 insertions(+), 3 deletions(-)
> create mode 100644 libavcodec/riscv/h264dsp_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index c180223141..a1510e8c6e 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
> OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
> OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
> +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
> OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
> RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
> OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
> diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
> index dbbf3db400..8c77303ec6 100644
> --- a/libavcodec/riscv/h264dsp_init.c
> +++ b/libavcodec/riscv/h264dsp_init.c
> @@ -1,4 +1,5 @@
> /*
> + * Copyright (c) 2024 J. Dekker <jdek at itanimul.li>
> * Copyright © 2024 Rémi Denis-Courmont.
> *
> * This file is part of FFmpeg.
> @@ -24,22 +25,57 @@
>
> #include "libavutil/attributes.h"
> #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> #include "libavcodec/h264dsp.h"
>
> extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
> extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
> +void ff_h264_idct4_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
>
> -av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
> +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth,
> const int chroma_format_idc)
> {
> #if HAVE_RV
> int flags = av_get_cpu_flags();
>
> if (flags & AV_CPU_FLAG_RVB_BASIC)
> - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> + c->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> # if HAVE_RVV
> if (flags & AV_CPU_FLAG_RVV_I32)
> - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> + c->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> # endif
> + if ((flags & AV_CPU_FLAG_RVV_I64) && ff_rv_vlen_least(16)) {
The assembler below does not seem to require 64-bit elements for anything?
Also ff_rv_vlen_least() expects bits, not bytes.
> + switch(bit_depth) {
> + case 8:
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
> + break;
> + case 9:
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
> + break;
> + case 10:
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
> + break;
> + case 12:
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
> + break;
> + case 14:
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
> + break;
> + }
> + }
> #endif
> }
> diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
> new file mode 100644
> index 0000000000..57f0433f7c
> --- /dev/null
> +++ b/libavcodec/riscv/h264dsp_rvv.S
> @@ -0,0 +1,176 @@
> +/*
> + * SPDX-License-Identifier: BSD-2-Clause
> + *
> + * Copyright (c) 2024 J. Dekker <jdek at itanimul.li>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro idct_dc_add8 width
> +func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
> + vsetivli zero, \width, e16, m1, ta, ma
mf2 should be faster if \width == 4.
> + lh a3, 0(a1)
> + addi a3, a3, 32
> + srai a3, a3, 6
> + sh zero, 0(a1)
> +.if \width == 8
> + vlsseg8e8.v v24, (a0), a2
> +.else
> + vlsseg4e8.v v24, (a0), a2
> +.endif
You could probably use vlse64.v or vlse32.v here, treating each row as an
element - and respectively vsse{64,32}.v at the end. You can then load a whole
8x8 or 4x4 matrix in a single vector group.
This should work fine given that this function does not need to identify rows
or columns. This is probably faster than using segments. It would also avoid
repeating each instruction 8 or 4 times below.
That should also work for the 16-bit 4x4 function. Unfortunately, it won't
work for 16-bit 8x8, as vlse128.v does not exist.
> + vzext.vf2 v0, v24
> + vzext.vf2 v2, v25
> + vzext.vf2 v4, v26
> + vzext.vf2 v6, v27
> +.if \width == 8
> + vzext.vf2 v10, v28
> + vzext.vf2 v12, v29
> + vzext.vf2 v14, v30
> + vzext.vf2 v16, v31
> +.endif
> + vadd.vx v0, v0, a3
> + vadd.vx v2, v2, a3
> + vadd.vx v4, v4, a3
> + vadd.vx v6, v6, a3
> +.if \width == 8
> + vadd.vx v10, v10, a3
> + vadd.vx v12, v12, a3
> + vadd.vx v14, v14, a3
> + vadd.vx v16, v16, a3
> +.endif
> + vmax.vx v0, v0, zero
> + vmax.vx v2, v2, zero
> + vmax.vx v4, v4, zero
> + vmax.vx v6, v6, zero
> +.if \width == 8
> + vmax.vx v10, v10, zero
> + vmax.vx v12, v12, zero
> + vmax.vx v14, v14, zero
> + vmax.vx v16, v16, zero
> +.endif
> + vsetvli zero, zero, e8, mf2, ta, ma
> + vnclipu.wi v24, v0, 0
> + vnclipu.wi v25, v2, 0
> + vnclipu.wi v26, v4, 0
> + vnclipu.wi v27, v6, 0
> +.if \width == 8
> + vnclipu.wi v28, v10, 0
> + vnclipu.wi v29, v12, 0
> + vnclipu.wi v30, v14, 0
> + vnclipu.wi v31, v16, 0
> + vssseg8e8.v v24, (a0), a2
> +.else
> + vssseg4e8.v v24, (a0), a2
> +.endif
> + ret
> +endfunc
> +.endm
> +
> +idct_dc_add8 4
> +idct_dc_add8 8
> +
> +.macro idct_dc_add width
> +func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
> + vsetivli zero, \width, e16, m1, ta, ma
> + lw a3, 0(a1)
> + addi a3, a3, 32
> + srai a3, a3, 6
> + sw zero, 0(a1)
> + add t4, a0, a2
> + sh1add t5, a2, a0
> + sh1add t6, a2, t4
> +.if \width == 8
> + sh2add t0, a2, a0
> + sh2add t1, a2, t4
> + sh2add t2, a2, t5
> + sh2add t3, a2, t6
> +.endif
> + vle16.v v0, (a0)
> + vle16.v v2, (t4)
> + vle16.v v4, (t5)
> + vle16.v v6, (t6)
> +.if \width == 8
> + vle16.v v10, (t0)
> + vle16.v v12, (t1)
> + vle16.v v14, (t2)
> + vle16.v v16, (t3)
> +.endif
> + vadd.vx v0, v0, a3
> + vadd.vx v2, v2, a3
> + vadd.vx v4, v4, a3
> + vadd.vx v6, v6, a3
> +.if \width == 8
> + vadd.vx v10, v10, a3
> + vadd.vx v12, v12, a3
> + vadd.vx v14, v14, a3
> + vadd.vx v16, v16, a3
> +.endif
> + vmax.vx v0, v0, zero
> + vmax.vx v2, v2, zero
> + vmax.vx v4, v4, zero
> + vmax.vx v6, v6, zero
> +.if \width == 8
> + vmax.vx v10, v10, zero
> + vmax.vx v12, v12, zero
> + vmax.vx v14, v14, zero
> + vmax.vx v16, v16, zero
> +.endif
> + vmin.vx v0, v0, a5
> + vmin.vx v2, v2, a5
> + vmin.vx v4, v4, a5
> + vmin.vx v6, v6, a5
> +.if \width == 8
> + vmin.vx v10, v10, a5
> + vmin.vx v12, v12, a5
> + vmin.vx v14, v14, a5
> + vmin.vx v16, v16, a5
> +.endif
> + vse16.v v0, (a0)
> + vse16.v v2, (t4)
> + vse16.v v4, (t5)
> + vse16.v v6, (t6)
> +.if \width == 8
> + vse16.v v10, (t0)
> + vse16.v v12, (t1)
> + vse16.v v14, (t2)
> + vse16.v v16, (t3)
> +.endif
> + ret
> +endfunc
> +.endm
> +
> +idct_dc_add 4
> +idct_dc_add 8
> +
> +.irp depth,9,10,12,14
> +func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
> + li a5, (1 << \depth) - 1
> + j ff_h264_idct4_dc_add_16_rvv
> +endfunc
> +
> +func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
> + li a5, (1 << \depth) - 1
> + j ff_h264_idct8_dc_add_16_rvv
> +endfunc
> +.endr
--
レミ・デニ-クールモン
http://www.remlab.net/
More information about the ffmpeg-devel
mailing list