[FFmpeg-devel] [PATCH 2/2] lavc/rv40dsp: fix RISC-V chroma_mc
flow gg
hlefthleft at gmail.com
Tue Nov 19 11:12:48 EET 2024
Please ignore this
<uk7b at foxmail.com> wrote on Tue, Nov 19, 2024 at 17:08:
> From: sunyuechi <sunyuechi at iscas.ac.cn>
>
> ---
> libavcodec/riscv/rv40dsp_rvv.S | 111 ++++++++++++++++++++++-----------
> 1 file changed, 73 insertions(+), 38 deletions(-)
>
> diff --git a/libavcodec/riscv/rv40dsp_rvv.S b/libavcodec/riscv/rv40dsp_rvv.S
> index ca431eb8ab..a1b2d6d1df 100644
> --- a/libavcodec/riscv/rv40dsp_rvv.S
> +++ b/libavcodec/riscv/rv40dsp_rvv.S
> @@ -20,15 +20,27 @@
>
> #include "libavutil/riscv/asm.S"
>
> -.macro manual_avg dst src1 src2
> - vadd.vv \dst, \src1, \src2
> - vadd.vi \dst, \dst, 1
> - vsrl.vi \dst, \dst, 1
> -.endm
> +const rv40_bias
> + .byte 0, 16, 32, 16
> + .byte 32, 28, 32, 28
> + .byte 0, 32, 16, 32
> + .byte 32, 28, 32, 28
> +endconst
>
> .macro do_chroma_mc type unroll
> - csrwi vxrm, 2
> + csrwi vxrm, 0
> +#if __riscv_xlen == 64
> + addi sp, sp, -8
> +#else
> + addi sp, sp, -4
> +#endif
> + lla t4, rv40_bias
> + sd s0, (sp)
> + srli t5, a4, 1
> + sh1add t4, a5, t4
> + add t5, t4, t5
> slli t2, a5, 3
> + lb s0, (t5)
> mul t1, a5, a4
> sh3add a5, a4, t2
> slli a4, a4, 3
> @@ -80,17 +92,19 @@
> vwmulu.vx v12, v14, a6
> vwmaccu.vx v10, t1, v15
> vwmaccu.vx v12, a7, v15
> - vnclipu.wi v15, v8, 6
> + vwaddu.wx v20, v8, s0
> + vnsrl.wi v15, v20, 6
> .ifc \type,avg
> vle8.v v9, (a0)
> - manual_avg v15, v15, v9
> + vaaddu.vv v15, v15, v9
> .endif
> vse8.v v15, (a0)
> add a0, a0, a2
> - vnclipu.wi v8, v10, 6
> + vwaddu.wx v20, v10, s0
> + vnsrl.wi v8, v20, 6
> .ifc \type,avg
> vle8.v v9, (a0)
> - manual_avg v8, v8, v9
> + vaaddu.vv v8, v8, v9
> .endif
> add t4, t4, t3
> vse8.v v8, (a0)
> @@ -115,17 +129,19 @@
> vslide1down.vx v14, v14, t5
> vsetvli zero, t6, e8, m1, ta, ma
> vwmaccu.vx v16, t1, v14
> - vnclipu.wi v8, v12, 6
> + vwaddu.wx v20, v12, s0
> + vnsrl.wi v8, v20, 6
> .ifc \type,avg
> vle8.v v9, (a0)
> - manual_avg v8, v8, v9
> + vaaddu.vv v8, v8, v9
> .endif
> vse8.v v8, (a0)
> add a0, a0, a2
> - vnclipu.wi v8, v16, 6
> + vwaddu.wx v20, v16, s0
> + vnsrl.wi v8, v20, 6
> .ifc \type,avg
> vle8.v v9, (a0)
> - manual_avg v8, v8, v9
> + vaaddu.vv v8, v8, v9
> .endif
> vse8.v v8, (a0)
> add a0, a0, a2
> @@ -159,18 +175,20 @@
> vwmaccu.vx v10, t0, v8
> add a4, a4, a7
> vwmaccu.vx v12, t0, v9
> - vnclipu.wi v15, v10, 6
> + vwaddu.wx v20, v10, s0
> + vnsrl.wi v15, v20, 6
> vwmulu.vx v10, v9, a6
> - vnclipu.wi v9, v12, 6
> + vwaddu.wx v20, v12, s0
> + vnsrl.wi v9, v20, 6
> .ifc \type,avg
> vle8.v v16, (a0)
> - manual_avg v15, v15, v16
> + vaaddu.vv v15, v15, v16
> .endif
> vse8.v v15, (a0)
> add a0, a0, a2
> .ifc \type,avg
> vle8.v v16, (a0)
> - manual_avg v9, v9, v16
> + vaaddu.vv v9, v9, v16
> .endif
> vse8.v v9, (a0)
> add a0, a0, a2
> @@ -179,18 +197,20 @@
> vle8.v v14, (a5)
> vwmaccu.vx v10, t0, v8
> vwmulu.vx v12, v8, a6
> - vnclipu.wi v8, v10, 6
> + vwaddu.wx v20, v10, s0
> + vnsrl.wi v8, v20, 6
> vwmaccu.vx v12, t0, v14
> .ifc \type,avg
> vle8.v v16, (a0)
> - manual_avg v8, v8, v16
> + vaaddu.vv v8, v8, v16
> .endif
> vse8.v v8, (a0)
> add a0, a0, a2
> - vnclipu.wi v8, v12, 6
> + vwaddu.wx v20, v12, s0
> + vnsrl.wi v8, v20, 6
> .ifc \type,avg
> vle8.v v16, (a0)
> - manual_avg v8, v8, v16
> + vaaddu.vv v8, v8, v16
> .endif
> vse8.v v8, (a0)
> add a0, a0, a2
> @@ -226,17 +246,19 @@
> vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v12, v8, a6
> vwmaccu.vx v12, a7, v9
> - vnclipu.wi v16, v10, 6
> + vwaddu.wx v20, v10, s0
> + vnsrl.wi v16, v20, 6
> .ifc \type,avg
> vle8.v v18, (a0)
> - manual_avg v16, v16, v18
> + vaaddu.vv v16, v16, v18
> .endif
> vse8.v v16, (a0)
> add a0, a0, a2
> - vnclipu.wi v10, v12, 6
> + vwaddu.wx v20, v12, s0
> + vnsrl.wi v10, v20, 6
> .ifc \type,avg
> vle8.v v18, (a0)
> - manual_avg v10, v10, v18
> + vaaddu.vv v10, v10, v18
> .endif
> add a4, a4, t1
> vse8.v v10, (a0)
> @@ -254,18 +276,20 @@
> vslide1down.vx v9, v8, t5
> vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v12, v8, a6
> - vnclipu.wi v8, v14, 6
> + vwaddu.wx v20, v14, s0
> + vnsrl.wi v8, v20, 6
> vwmaccu.vx v12, a7, v9
> .ifc \type,avg
> vle8.v v18, (a0)
> - manual_avg v8, v8, v18
> + vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a0)
> add a0, a0, a2
> - vnclipu.wi v8, v12, 6
> + vwaddu.wx v20, v12, s0
> + vnsrl.wi v8, v20, 6
> .ifc \type,avg
> vle8.v v18, (a0)
> - manual_avg v8, v8, v18
> + vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a0)
> add a0, a0, a2
> @@ -293,18 +317,20 @@
> vwmulu.vx v10, v8, a6
> vle8.v v8, (t0)
> add t0, t1, a2
> - vnclipu.wi v13, v10, 6
> + vwaddu.wx v20, v10, s0
> + vnsrl.wi v13, v20, 6
> vwmulu.vx v10, v8, a6
> .ifc \type,avg
> vle8.v v18, (a5)
> - manual_avg v13, v13, v18
> + vaaddu.vv v13, v13, v18
> .endif
> vse8.v v13, (a5)
> add a5, a5, a2
> - vnclipu.wi v8, v10, 6
> + vwaddu.wx v20, v10, s0
> + vnsrl.wi v8, v20, 6
> .ifc \type,avg
> vle8.v v18, (a5)
> - manual_avg v8, v8, v18
> + vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a5)
> add a5, a5, a2
> @@ -312,23 +338,32 @@
> vle8.v v9, (t1)
> vle8.v v12, (t0)
> vwmulu.vx v10, v9, a6
> - vnclipu.wi v8, v10, 6
> + vwaddu.wx v20, v10, s0
> + vnsrl.wi v8, v20, 6
> vwmulu.vx v10, v12, a6
> .ifc \type,avg
> vle8.v v18, (a5)
> - manual_avg v8, v8, v18
> + vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a5)
> add a5, a5, a2
> - vnclipu.wi v8, v10, 6
> + vwaddu.wx v20, v10, s0
> + vnsrl.wi v8, v20, 6
> .ifc \type,avg
> vle8.v v18, (a5)
> - manual_avg v8, v8, v18
> + vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a5)
> .endif
> blt t2, a3, 7b
> 8:
> + ld s0, (sp)
> +#if __riscv_xlen == 64
> + addi sp, sp, 8
> +#else
> + addi sp, sp, 4
> +#endif
> +
> ret
> .endm
>
> --
> 2.47.0
>
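For anyone skimming the diff: a rough scalar sketch of the rounding the
patch is after (my own illustration, not the actual lavc code path; the
function name and the exact bias indexing are assumptions on my part).
As I read the old code, it narrowed with a plain truncating shift and no
bias (vxrm was set to round-down), whereas RV40 chroma MC adds a
per-subpel bias before the >> 6; the bias values are the ones in the new
rv40_bias constant. Switching vxrm to round-to-nearest-up also lets
vaaddu compute the (a + b + 1) >> 1 averaging that the removed
manual_avg macro did by hand.

#include <stddef.h>
#include <stdint.h>

/* Same values as the rv40_bias constant added by the patch. */
static const uint8_t rv40_bias_tab[4][4] = {
    {  0, 16, 32, 16 },
    { 32, 28, 32, 28 },
    {  0, 32, 16, 32 },
    { 32, 28, 32, 28 },
};

/* Sketch of a "put" chroma MC with biased rounding.  x and y are the
 * 1/8-pel chroma fractions (0..7); w and h give the block size.  The
 * real code special-cases x == 0 / y == 0, omitted here for brevity. */
static void put_rv40_chroma_sketch(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int w, int h,
                                   int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;
    const int bias = rv40_bias_tab[y >> 1][x >> 1];  /* assumed indexing */

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      bias) >> 6;          /* truncating shift after +bias */
        dst += stride;
        src += stride;
    }
}

The RVV path in the patch reproduces the same (sum + bias) >> 6 by
widening-adding the bias held in s0 (vwaddu.wx) and then doing a plain
narrowing shift (vnsrl.wi) instead of the previous vnclipu.wi.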