[FFmpeg-devel] [PATCH 2/2] lavc/vvc_mc: R-V V dmvr
Rémi Denis-Courmont
remi at remlab.net
Sat Sep 28 09:49:09 EEST 2024
Hi,
Le perjantaina 27. syyskuuta 2024, 20.09.30 EEST uk7b at foxmail.com a écrit :
> From: sunyuechi <sunyuechi at iscas.ac.cn>
>
> k230 banana_f3
> dmvr_8_12x20_c: 628.5 ( 1.00x) 624.1 ( 1.00x)
> dmvr_8_12x20_rvv_i32: 137.5 ( 4.57x) 92.9 ( 6.72x)
> dmvr_8_20x12_c: 609.7 ( 1.00x) 655.4 ( 1.00x)
> dmvr_8_20x12_rvv_i32: 146.7 ( 4.16x) 82.4 ( 7.95x)
> dmvr_8_20x20_c: 998.7 ( 1.00x) 1092.9 ( 1.00x)
> dmvr_8_20x20_rvv_i32: 221.0 ( 4.52x) 144.9 ( 7.54x)
> dmvr_h_8_12x20_c: 2008.0 ( 1.00x) 1999.2 ( 1.00x)
> dmvr_h_8_12x20_rvv_i32: 285.7 ( 7.03x) 207.4 ( 9.64x)
> dmvr_h_8_20x12_c: 1989.5 ( 1.00x) 2009.7 ( 1.00x)
> dmvr_h_8_20x12_rvv_i32: 322.7 ( 6.16x) 176.2 (11.41x)
> dmvr_h_8_20x20_c: 3304.2 ( 1.00x) 3342.9 ( 1.00x)
> dmvr_h_8_20x20_rvv_i32: 526.5 ( 6.28x) 290.6 (11.50x)
> dmvr_hv_8_12x20_c: 3609.7 ( 1.00x) 3603.4 ( 1.00x)
> dmvr_hv_8_12x20_rvv_i32: 554.2 ( 6.51x) 467.9 ( 7.70x)
> dmvr_hv_8_20x12_c: 3637.5 ( 1.00x) 3624.4 ( 1.00x)
> dmvr_hv_8_20x12_rvv_i32: 489.5 ( 7.43x) 342.6 (10.58x)
> dmvr_hv_8_20x20_c: 6794.7 ( 1.00x) 5936.9 ( 1.00x)
> dmvr_hv_8_20x20_rvv_i32: 785.7 ( 8.65x) 561.4 (10.58x)
> dmvr_v_8_12x20_c: 2156.0 ( 1.00x) 2155.2 ( 1.00x)
> dmvr_v_8_12x20_rvv_i32: 295.0 ( 7.31x) 207.4 (10.39x)
> dmvr_v_8_20x12_c: 2137.5 ( 1.00x) 2165.7 ( 1.00x)
> dmvr_v_8_20x12_rvv_i32: 322.7 ( 6.62x) 186.7 (11.60x)
> dmvr_v_8_20x20_c: 3554.2 ( 1.00x) 3593.2 ( 1.00x)
> dmvr_v_8_20x20_rvv_i32: 535.7 ( 6.63x) 290.6 (12.36x)
> ---
> libavcodec/riscv/vvc/vvc_mc_rvv.S | 141 +++++++++++++++++++++++++++++
> libavcodec/riscv/vvc/vvcdsp_init.c | 22 +++++
> 2 files changed, 163 insertions(+)
>
> diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> index 18532616d9..a5e20cbc67 100644
> --- a/libavcodec/riscv/vvc/vvc_mc_rvv.S
> +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> @@ -285,3 +285,144 @@ endfunc
> func_w_avg 128
> func_w_avg 256
> #endif
> +
> +func dmvr zve32x, zbb, zba
> + lpad 0
> + li t0, 4
> +1:
> + add t1, a1, a2
> + addi t4, a0, 128*2
> + add t2, t1, a2
> + addi t5, a0, 128*2*2
> + add t3, t2, a2
> + addi t6, a0, 128*2*3
> + vle8.v v0, (a1)
> + vle8.v v4, (t1)
> + vle8.v v8, (t2)
> + vle8.v v12, (t3)
> + addi a3, a3, -4
> + vwmulu.vx v16, v0, t0
> + vwmulu.vx v20, v4, t0
> + vwmulu.vx v24, v8, t0
> + vwmulu.vx v28, v12, t0
> + vse16.v v16, (a0)
> + vse16.v v20, (t4)
> + vse16.v v24, (t5)
> + vse16.v v28, (t6)
> + sh2add a1, a2, a1
> + add a0, a0, 128*2*4
> + bnez a3, 1b
> + ret
> +endfunc
Is 4x unroll really faster than 2x here? We don't typically unroll 4x
manually.
> +
> +.macro dmvr_h_v mn, type
> + lla t4, ff_vvc_inter_luma_dmvr_filters
> + sh1add t4, \mn, t4
> + lbu t5, (t4)
> + lbu t6, 1(t4)
> +1:
> +.ifc \type,h
> + addi t0, a1, 1
> + addi t1, a1, 2
> +.else
> + add t0, a1, a2
> + add t1, t0, a2
> +.endif
> + vle8.v v0, (a1)
> + vle8.v v4, (t0)
> + vle8.v v8, (t1)
> + addi a3, a3, -2
> + vzext.vf2 v12, v0
> + vzext.vf2 v16, v4
> + vzext.vf2 v20, v8
> + addi t2, a0, 128*2
> + vmul.vx v12, v12, t5
t5 seems to be 8-bit, so vwmulu.vx should work better here? Since you
leveraged it in the previous function, I'm a bit confused why not here, TBH.
> + vmul.vx v24, v16, t5
> + vmacc.vx v12, t6, v16
> + vmacc.vx v24, t6, v20
Likewise vwmaccu.vx.
> + vadd.vi v12, v12, 2
> + vadd.vi v24, v24, 2
> + vsra.vi v12, v12, 2
> + vsra.vi v24, v24, 2
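Missed rounding opportunity: vssra.vi, with the fixed-point rounding mode (vxrm) set to round-to-nearest-up, should work better here, as it replaces each vadd.vi + vsra.vi pair with a single instruction.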
> + vse16.v v12, (a0)
> + vse16.v v24, (t2)
> + add a0, a0, 128*4
> + sh1add a1, a2, a1
> + bnez a3, 1b
> + ret
> +.endm
> +
> +func dmvr_h zve32x, zbb, zba
> + lpad 0
> + dmvr_h_v a4, h
> +endfunc
> +
> +func dmvr_v zve32x, zbb, zba
> + lpad 0
> + dmvr_h_v a5, v
> +endfunc
> +
> +.macro dmvr_load_h dst, filter0, filter1
> + addi a6, a1, 1
> + vle8.v \dst, (a1)
> + vle8.v v2, (a6)
> + vzext.vf2 v4, \dst
> + vzext.vf2 v8, v2
> + vmul.vx \dst, v4, \filter0
> + vmacc.vx \dst, \filter1, v8
> + vadd.vi \dst, \dst, 1 << (2 - 1)
> + vsra.vi \dst, \dst, 2
Same comments.
> +.endm
> +
> +func dmvr_hv zve32x, zbb, zba
> + lpad 0
> + lla t0, ff_vvc_inter_luma_dmvr_filters
> + sh1add t1, a4, t0
> + sh1add t2, a5, t0
> + lbu t3, (t1) // filter[mx][0]
> + lbu t4, 1(t1) // filter[mx][1]
> + lbu t5, (t2) // filter[my][0]
> + lbu t6, 1(t2) // filter[my][1]
> + dmvr_load_h v12, t3, t4
> + add a1, a1, a2
> +1:
> + vmul.vx v28, v12, t5
> + addi a3, a3, -1
> + dmvr_load_h v12, t3, t4
> + vmacc.vx v28, t6, v12
> + vadd.vi v28, v28, 1 << (4 - 1)
> + vsra.vi v28, v28, 4
Ditto.
> + vse16.v v28, (a0)
> + add a1, a1, a2
> + addi a0, a0, 128*2
> + bnez a3, 1b
> + ret
> +endfunc
> +
> +.macro func_dmvr vlen, name
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x, zbb, zba
> + lpad 0
> + li t0, 20
> + beq a6, t0, DMVR20\vlen\name
> + .ifc \name, dmvr
> + vsetvlstatic8 12, \vlen
> + .else
> + vsetvlstatic16 12, \vlen
> + .endif
> + j \name
> +DMVR20\vlen\name:
> + .ifc \name, dmvr
> + vsetvlstatic8 20, \vlen
> + .else
> + vsetvlstatic16 20, \vlen
> + .endif
> + j \name
> +endfunc
> +.endm
> +
> +.irp vlen,256,128
> +func_dmvr \vlen, dmvr
> +func_dmvr \vlen, dmvr_h
> +func_dmvr \vlen, dmvr_v
> +func_dmvr \vlen, dmvr_hv
> +.endr
> diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
> index ac1e7dda7d..7df3ce58db 100644
> --- a/libavcodec/riscv/vvc/vvcdsp_init.c
> +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> @@ -37,6 +37,26 @@ void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,
> AVG_PROTOTYPES(8, rvv_128)
> AVG_PROTOTYPES(8, rvv_256)
>
> +#define DMVR_PROTOTYPES(bd, opt) \
> +void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
> +    int height, intptr_t mx, intptr_t my, int width); \
> +void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
> +    int height, intptr_t mx, intptr_t my, int width); \
> +void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
> +    int height, intptr_t mx, intptr_t my, int width); \
> +void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
> +    int height, intptr_t mx, intptr_t my, int width); \
> +
> +DMVR_PROTOTYPES(8, rvv_128)
> +DMVR_PROTOTYPES(8, rvv_256)
> +
> +#define DMVR_INIT(bd, opt) do { \
> + c->inter.dmvr[0][0] = ff_vvc_dmvr_##bd##_##opt; \
> + c->inter.dmvr[0][1] = ff_vvc_dmvr_h_##bd##_##opt; \
> + c->inter.dmvr[1][0] = ff_vvc_dmvr_v_##bd##_##opt; \
> + c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_##opt; \
> +} while (0)
> +
> void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> {
> #if HAVE_RVV
> @@ -51,6 +71,7 @@ void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> # if (__riscv_xlen == 64)
> c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> # endif
> + DMVR_INIT(8, rvv_256);
> break;
> default:
> break;
> @@ -63,6 +84,7 @@ void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> # if (__riscv_xlen == 64)
> c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> # endif
> + DMVR_INIT(8, rvv_128);
> break;
> default:
> break;
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
More information about the ffmpeg-devel mailing list