[FFmpeg-devel] [PATCH 2/2] lavc/vvc_mc: R-V V dmvr

Sat Sep 28 19:48:03 EEST 2024

> At similar speed, shorter code is better.

Okay, updated it.

> Sure but so what? vsetvli/vsetivli is pretty fast (unlike vsetvl), and in
> this case the code would be shorter. Or are you trying to factor the code
> for different VTYPEs?

I mistakenly thought these vsets would slow things down. After updating,
it has indeed become faster.

Rémi Denis-Courmont <remi at remlab.net> 于2024年9月28日周六 21:49写道：

>
>
> Le 28 septembre 2024 12:42:37 GMT+03:00, flow gg <hlefthleft at gmail.com> a
> écrit :
> >> Is 4x unroll really faster than 2x here? We don't typically unroll 4x
> >> manually.
> >
> >I first did 2x and then changed it to 4x. The test results are similar, and
> >I'm not sure how to choose between them...
>
> At similar speed, shorter code is better.
>
> >> t5 seems to be 8-bit, so vwmulu.vx should work better here? Since you
> >> leveraged it in the previous function, I'm a bit confused why not here,
> >> TBH.
> >> Likewise vwmaccu.vx.
> >
> >DMVR doesn't have right shifts, but DMVR_h, _v, and _hv do.
> >So DMVR only needs one vset, while the others, if using widen, require vset
> >switching.
>
> Sure but so what? vsetvli/vsetivli is pretty fast (unlike vsetvl), and in
> this case the code would be shorter. Or are you trying to factor the code
> for different VTYPEs?
>
> >> Missing rounding opportunity, vssra.vi should work better here.
> >> Same comments.
> >
> >Okay, Updated it.
> >
> >Rémi Denis-Courmont <remi at remlab.net> 于2024年9月28日周六 14:56写道：
> >
> >> Hi,
> >>
> >> Le perjantaina 27. syyskuuta 2024, 20.09.30 EEST uk7b at foxmail.com a
> écrit
> >> :
> >> > From: sunyuechi <sunyuechi at iscas.ac.cn>
> >> >
> >> >                                      k230               banana_f3
> >> > dmvr_8_12x20_c:                       628.5 ( 1.00x)    624.1 ( 1.00x)
> >> > dmvr_8_12x20_rvv_i32:                 137.5 ( 4.57x)    92.9 ( 6.72x)
> >> > dmvr_8_20x12_c:                       609.7 ( 1.00x)    655.4 ( 1.00x)
> >> > dmvr_8_20x12_rvv_i32:                 146.7 ( 4.16x)    82.4 ( 7.95x)
> >> > dmvr_8_20x20_c:                       998.7 ( 1.00x)    1092.9 ( 1.00x)
> >> > dmvr_8_20x20_rvv_i32:                 221.0 ( 4.52x)    144.9 ( 7.54x)
> >> > dmvr_h_8_12x20_c:                    2008.0 ( 1.00x)    1999.2 ( 1.00x)
> >> > dmvr_h_8_12x20_rvv_i32:               285.7 ( 7.03x)    207.4 ( 9.64x)
> >> > dmvr_h_8_20x12_c:                    1989.5 ( 1.00x)    2009.7 ( 1.00x)
> >> > dmvr_h_8_20x12_rvv_i32:               322.7 ( 6.16x)    176.2 (11.41x)
> >> > dmvr_h_8_20x20_c:                    3304.2 ( 1.00x)    3342.9 ( 1.00x)
> >> > dmvr_h_8_20x20_rvv_i32:               526.5 ( 6.28x)    290.6 (11.50x)
> >> > dmvr_hv_8_12x20_c:                   3609.7 ( 1.00x)    3603.4 ( 1.00x)
> >> > dmvr_hv_8_12x20_rvv_i32:              554.2 ( 6.51x)    467.9 ( 7.70x)
> >> > dmvr_hv_8_20x12_c:                   3637.5 ( 1.00x)    3624.4 ( 1.00x)
> >> > dmvr_hv_8_20x12_rvv_i32:              489.5 ( 7.43x)    342.6 (10.58x)
> >> > dmvr_hv_8_20x20_c:                   6794.7 ( 1.00x)    5936.9 ( 1.00x)
> >> > dmvr_hv_8_20x20_rvv_i32:              785.7 ( 8.65x)    561.4 (10.58x)
> >> > dmvr_v_8_12x20_c:                    2156.0 ( 1.00x)    2155.2 ( 1.00x)
> >> > dmvr_v_8_12x20_rvv_i32:               295.0 ( 7.31x)    207.4 (10.39x)
> >> > dmvr_v_8_20x12_c:                    2137.5 ( 1.00x)    2165.7 ( 1.00x)
> >> > dmvr_v_8_20x12_rvv_i32:               322.7 ( 6.62x)    186.7 (11.60x)
> >> > dmvr_v_8_20x20_c:                    3554.2 ( 1.00x)    3593.2 ( 1.00x)
> >> > dmvr_v_8_20x20_rvv_i32:               535.7 ( 6.63x)    290.6 (12.36x)
> >> > ---
> >> >  libavcodec/riscv/vvc/vvc_mc_rvv.S  | 141 +++++++++++++++++++++++++++++
> >> >  libavcodec/riscv/vvc/vvcdsp_init.c |  22 +++++
> >> >  2 files changed, 163 insertions(+)
> >> >
> >> > diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S
> >> > b/libavcodec/riscv/vvc/vvc_mc_rvv.S index 18532616d9..a5e20cbc67
> 100644
> >> > --- a/libavcodec/riscv/vvc/vvc_mc_rvv.S
> >> > +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> >> > @@ -285,3 +285,144 @@ endfunc
> >> >  func_w_avg 128
> >> >  func_w_avg 256
> >> >  #endif
> >> > +
> >> > +func dmvr zve32x, zbb, zba
> >> > +        lpad    0
> >> > +        li                t0, 4
> >> > +1:
> >> > +        add               t1, a1, a2
> >> > +        addi              t4, a0, 128*2
> >> > +        add               t2, t1, a2
> >> > +        addi              t5, a0, 128*2*2
> >> > +        add               t3, t2, a2
> >> > +        addi              t6, a0, 128*2*3
> >> > +        vle8.v            v0, (a1)
> >> > +        vle8.v            v4, (t1)
> >> > +        vle8.v            v8, (t2)
> >> > +        vle8.v            v12, (t3)
> >> > +        addi              a3, a3, -4
> >> > +        vwmulu.vx         v16, v0, t0
> >> > +        vwmulu.vx         v20, v4, t0
> >> > +        vwmulu.vx         v24, v8, t0
> >> > +        vwmulu.vx         v28, v12, t0
> >> > +        vse16.v           v16, (a0)
> >> > +        vse16.v           v20, (t4)
> >> > +        vse16.v           v24, (t5)
> >> > +        vse16.v           v28, (t6)
> >> > +        sh2add            a1, a2, a1
> >> > +        add               a0, a0, 128*2*4
> >> > +        bnez              a3, 1b
> >> > +        ret
> >> > +endfunc
> >>
> >> Is 4x unroll really faster than 2x here? We don't typically unroll 4x
> >> manually.
> >>
> >> > +
> >> > +.macro dmvr_h_v mn, type
> >> > +        lla               t4, ff_vvc_inter_luma_dmvr_filters
> >> > +        sh1add            t4, \mn, t4
> >> > +        lbu               t5, (t4)
> >> > +        lbu               t6, 1(t4)
> >> > +1:
> >> > +.ifc \type,h
> >> > +        addi              t0, a1, 1
> >> > +        addi              t1, a1, 2
> >> > +.else
> >> > +        add               t0, a1, a2
> >> > +        add               t1, t0, a2
> >> > +.endif
> >> > +        vle8.v            v0, (a1)
> >> > +        vle8.v            v4, (t0)
> >> > +        vle8.v            v8, (t1)
> >> > +        addi              a3, a3, -2
> >> > +        vzext.vf2         v12, v0
> >> > +        vzext.vf2         v16, v4
> >> > +        vzext.vf2         v20, v8
> >> > +        addi              t2, a0, 128*2
> >> > +        vmul.vx           v12, v12, t5
> >>
> >> t5 seems to be 8-bit, so vwmulu.vx should work better here? Since you
> >> leveraged it in the previous function, I'm a bit confused why not here,
> >> TBH.
> >>
> >> > +        vmul.vx           v24, v16, t5
> >> > +        vmacc.vx          v12, t6, v16
> >> > +        vmacc.vx          v24, t6, v20
> >>
> >> Likewise vwmaccu.vx.
> >>
> >> > +        vadd.vi           v12, v12, 2
> >> > +        vadd.vi           v24, v24, 2
> >> > +        vsra.vi           v12, v12, 2
> >> > +        vsra.vi           v24, v24, 2
> >>
> >> Missing rounding opportunity, vssra.vi should work better here.
> >>
> >> > +        vse16.v           v12, (a0)
> >> > +        vse16.v           v24, (t2)
> >> > +        add               a0, a0, 128*4
> >> > +        sh1add            a1, a2, a1
> >> > +        bnez              a3, 1b
> >> > +        ret
> >> > +.endm
> >> > +
> >> > +func dmvr_h zve32x, zbb, zba
> >> > +        lpad    0
> >> > +        dmvr_h_v a4, h
> >> > +endfunc
> >> > +
> >> > +func dmvr_v zve32x, zbb, zba
> >> > +        lpad    0
> >> > +        dmvr_h_v a5, v
> >> > +endfunc
> >> > +
> >> > +.macro dmvr_load_h dst, filter0, filter1
> >> > +        addi            a6, a1, 1
> >> > +        vle8.v          \dst, (a1)
> >> > +        vle8.v          v2, (a6)
> >> > +        vzext.vf2       v4, \dst
> >> > +        vzext.vf2       v8, v2
> >> > +        vmul.vx         \dst, v4, \filter0
> >> > +        vmacc.vx        \dst, \filter1, v8
> >> > +        vadd.vi         \dst, \dst, 1 << (2 - 1)
> >> > +        vsra.vi         \dst, \dst, 2
> >>
> >> Same comments.
> >>
> >> > +.endm
> >> > +
> >> > +func dmvr_hv zve32x, zbb, zba
> >> > +        lpad    0
> >> > +        lla             t0, ff_vvc_inter_luma_dmvr_filters
> >> > +        sh1add          t1, a4, t0
> >> > +        sh1add          t2, a5, t0
> >> > +        lbu             t3, (t1)          // filter[mx][0]
> >> > +        lbu             t4, 1(t1)         // filter[mx][1]
> >> > +        lbu             t5, (t2)          // filter[my][0]
> >> > +        lbu             t6, 1(t2)         // filter[my][1]
> >> > +        dmvr_load_h     v12, t3, t4
> >> > +        add             a1, a1, a2
> >> > +1:
> >> > +        vmul.vx         v28, v12, t5
> >> > +        addi            a3, a3, -1
> >> > +        dmvr_load_h     v12, t3, t4
> >> > +        vmacc.vx        v28, t6, v12
> >> > +        vadd.vi         v28, v28, 1 << (4 - 1)
> >> > +        vsra.vi         v28, v28, 4
> >>
> >> Ditto.
> >>
> >> > +        vse16.v         v28, (a0)
> >> > +        add             a1, a1, a2
> >> > +        addi            a0, a0, 128*2
> >> > +        bnez            a3, 1b
> >> > +        ret
> >> > +endfunc
> >> > +
> >> > +.macro func_dmvr vlen, name
> >> > +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x, zbb, zba
> >> > +        lpad    0
> >> > +        li               t0, 20
> >> > +        beq              a6, t0, DMVR20\vlen\name
> >> > +        .ifc \name, dmvr
> >> > +        vsetvlstatic8    12, \vlen
> >> > +        .else
> >> > +        vsetvlstatic16   12, \vlen
> >> > +        .endif
> >> > +        j                \name
> >> > +DMVR20\vlen\name:
> >> > +        .ifc \name, dmvr
> >> > +        vsetvlstatic8    20, \vlen
> >> > +        .else
> >> > +        vsetvlstatic16   20, \vlen
> >> > +        .endif
> >> > +        j                \name
> >> > +endfunc
> >> > +.endm
> >> > +
> >> > +.irp vlen,256,128
> >> > +func_dmvr \vlen, dmvr
> >> > +func_dmvr \vlen, dmvr_h
> >> > +func_dmvr \vlen, dmvr_v
> >> > +func_dmvr \vlen, dmvr_hv
> >> > +.endr
> >> > diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c
> >> > b/libavcodec/riscv/vvc/vvcdsp_init.c index ac1e7dda7d..7df3ce58db
> 100644
> >> > --- a/libavcodec/riscv/vvc/vvcdsp_init.c
> >> > +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> >> > @@ -37,6 +37,26 @@ void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst,
> ptrdiff_t
> >> > dst_stride, AVG_PROTOTYPES(8, rvv_128)
> >> >  AVG_PROTOTYPES(8, rvv_256)
> >> >
> >> > +#define DMVR_PROTOTYPES(bd, opt)
> >>
> >> >                         \ +void ff_vvc_dmvr_##bd##_##opt(int16_t *dst,
> >> > const uint8_t *src, ptrdiff_t src_stride,               \ +     int
> >> height,
> >> > intptr_t mx, intptr_t my, int width);
> >>
> >> >        \ +void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t
> >> *src,
> >> > ptrdiff_t src_stride,             \ +     int height, intptr_t mx,
> >> intptr_t
> >> > my, int width);                                              \ +void
> >> > ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t
> >> > src_stride,             \ +     int height, intptr_t mx, intptr_t my,
> int
> >> > width);                                              \ +void
> >> > ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src,
> ptrdiff_t
> >> > src_stride,            \ +     int height, intptr_t mx, intptr_t my,
> int
> >> > width);                                              \ +
> >> > +DMVR_PROTOTYPES(8, rvv_128)
> >> > +DMVR_PROTOTYPES(8, rvv_256)
> >> > +
> >> > +#define DMVR_INIT(bd, opt) do {                                    \
> >> > +    c->inter.dmvr[0][0]   = ff_vvc_dmvr_##bd##_##opt;              \
> >> > +    c->inter.dmvr[0][1]   = ff_vvc_dmvr_h_##bd##_##opt;            \
> >> > +    c->inter.dmvr[1][0]   = ff_vvc_dmvr_v_##bd##_##opt;            \
> >> > +    c->inter.dmvr[1][1]   = ff_vvc_dmvr_hv_##bd##_##opt;           \
> >> > +} while (0)
> >> > +
> >> >  void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> >> >  {
> >> >  #if HAVE_RVV
> >> > @@ -51,6 +71,7 @@ void ff_vvc_dsp_init_riscv(VVCDSPContext *const c,
> >> const
> >> > int bd) # if (__riscv_xlen == 64)
> >> >                  c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
> >> >  # endif
> >> > +                DMVR_INIT(8, rvv_256);
> >> >                  break;
> >> >              default:
> >> >                  break;
> >> > @@ -63,6 +84,7 @@ void ff_vvc_dsp_init_riscv(VVCDSPContext *const c,
> >> const
> >> > int bd) # if (__riscv_xlen == 64)
> >> >                  c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
> >> >  # endif
> >> > +                DMVR_INIT(8, rvv_128);
> >> >                  break;
> >> >              default:
> >> >                  break;
> >>
> >>
> >> --
> >> 雷米‧德尼-库尔蒙
> >> http://www.remlab.net/
> >>
> >>
> >>
> >> _______________________________________________
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel at ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
> >>
> >_______________________________________________
> >ffmpeg-devel mailing list
> >ffmpeg-devel at ffmpeg.org
> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >To unsubscribe, visit link above, or email
> >ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>