[FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks

Wed Jun 14 18:57:50 EEST 2023

Le perjantaina 9. kesäkuuta 2023, 10.17.27 EEST Arnie Chang a écrit :
> Optimize the put and avg filtering for 4xH and 2xH blocks
> 
> Signed-off-by: Arnie Chang <arnie.chang at sifive.com>
> ---
> checkasm: using random seed 3475799765
> RVVi32:
>  - h264chroma.chroma_mc [OK]
> checkasm: all 6 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 466.5
> avg_h264_chroma_mc2_8_c: 939.2
> avg_h264_chroma_mc2_8_rvv_i32: 466.5
> avg_h264_chroma_mc4_8_c: 502.2
> avg_h264_chroma_mc4_8_rvv_i32: 466.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 382.5
> put_h264_chroma_mc2_8_c: 824.2
> put_h264_chroma_mc2_8_rvv_i32: 382.5
> put_h264_chroma_mc4_8_c: 431.2
> put_h264_chroma_mc4_8_rvv_i32: 382.5
> 
>  libavcodec/riscv/h264_chroma_init_riscv.c |   8 +
>  libavcodec/riscv/h264_mc_chroma.S         | 216 ++++++++++++++--------
>  2 files changed, 144 insertions(+), 80 deletions(-)
> 
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c
> b/libavcodec/riscv/h264_chroma_init_riscv.c index 7c905edfcd..9f95150ea3
> 100644
> --- a/libavcodec/riscv/h264_chroma_init_riscv.c
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -27,6 +27,10 @@
> 
>  void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y); void
> h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y); +void h264_put_chroma_mc4_rvv(uint8_t *p_dst,
> const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y); +void
> h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y); +void h264_put_chroma_mc2_rvv(uint8_t *p_dst,
> const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y); +void
> h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y);
> 
>  av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
>  {
> @@ -36,6 +40,10 @@ av_cold void ff_h264chroma_init_riscv(H264ChromaContext
> *c, int bit_depth) if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_get_rv_vlenb() >= 16) { c->put_h264_chroma_pixels_tab[0] =
> h264_put_chroma_mc8_rvv; c->avg_h264_chroma_pixels_tab[0] =
> h264_avg_chroma_mc8_rvv; +        c->put_h264_chroma_pixels_tab[1] =
> h264_put_chroma_mc4_rvv; +        c->avg_h264_chroma_pixels_tab[1] =
> h264_avg_chroma_mc4_rvv; +        c->put_h264_chroma_pixels_tab[2] =
> h264_put_chroma_mc2_rvv; +        c->avg_h264_chroma_pixels_tab[2] =
> h264_avg_chroma_mc2_rvv; }
>  #endif
>  }
> diff --git a/libavcodec/riscv/h264_mc_chroma.S
> b/libavcodec/riscv/h264_mc_chroma.S index 364bc3156e..c97cdbad86 100644
> --- a/libavcodec/riscv/h264_mc_chroma.S
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -19,8 +19,7 @@
>   */
>  #include "libavutil/riscv/asm.S"
> 
> -.macro  h264_chroma_mc8 type
> -func h264_\type\()_chroma_mc8_rvv, zve32x
> +.macro  do_chroma_mc type width unroll

It looks like \width is only ever used as the AVL (application vector length).
You could advantageously pass it as a run-time argument to a shared internal
function (setting the vector length with VSETVLI rather than VSETIVLI), and
spare the instruction cache, instead of instantiating otherwise identical code
three times.

>          csrw            vxrm, zero
>          slli            t2, a5, 3
>          mul             t1, a5, a4
> @@ -30,94 +29,104 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
>          sub             a7, a4, t1
>          addi            a6, a5, 64
>          sub             t0, t2, t1
> -        vsetivli        t3, 8, e8, m1, ta, mu
> +        vsetivli        t3, \width, e8, m1, ta, mu
>          beqz            t1, 2f
>          blez            a3, 8f
>          li              t4, 0
>          li              t2, 0
>          li              t5, 1
>          addi            a5, t3, 1
> +  .ifc \unroll,1
>          slli            t3, a2, 2
> +  .else
> +        slli            t3, a2, 1
> +  .endif

Note that all those 5-line conditional shift blocks could be simplified by
folding \unroll into the shift amount, e.g.:

    slli t3, a2, (1 + \unroll)

Though I wonder if we could leverage the SH*ADD instructions (SH1ADD, SH2ADD,
SH3ADD from the Zba extension) in some cases instead of SLLI?

(..)

> +.endm
> +
> +.macro  h264_chroma_mc type width
> +func h264_\type\()_chroma_mc\width\()_rvv, zve32x
> +  .ifc \width,8
> +        do_chroma_mc \type 8 1
> +  .else
> +        li      a7, 3
> +        blt     a3, a7, 11f
> +        do_chroma_mc \type \width 1
> +11:
> +        do_chroma_mc \type \width 0
> +  .endif

-- 
Rémi Denis-Courmont
http://www.remlab.net/