[FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr

Tue Dec 12 23:05:03 EET 2023

Le tiistaina 12. joulukuuta 2023, 23.02.40 EET Rémi Denis-Courmont a écrit :
> The loop iterates over the length of the input data, not over the order
> (lag). This is to avoid reloading the same data for each lag value. However,
> this means the loop only works if the maximum order is no larger than VLENB.
> 
> The loop is roughly equivalent to:
> 
>     for (size_t j = 0; j < lag; j++)
>         autoc[j] = 1.;
> 
>     while (len > lag) {
>         for (ptrdiff_t j = 0; j < lag; j++)
>             autoc[j] += data[j] * *data;
>         data++;
>         len--;
>     }
> 
>     while (len > 0) {
>         for (ptrdiff_t j = 0; j < len; j++)
>             autoc[j] += data[j] * *data;
>         data++;
>         len--;
>     }
> 
> Since register pressure is only at 50%, it should be possible to implement
> the same loop for order up to 2xVLENB. But this is left for future work.
> 
> Performance numbers are all over the place from ~1.25x to ~4x speedups,
> but at least they are always noticeably better than nothing.
> ---
>  libavcodec/riscv/lpc_init.c |  8 +++++++-
>  libavcodec/riscv/lpc_rvv.S  | 29 +++++++++++++++++++++++++++++
>  2 files changed, 36 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c
> index c16e5745f0..ab91956f2d 100644
> --- a/libavcodec/riscv/lpc_init.c
> +++ b/libavcodec/riscv/lpc_init.c
> @@ -22,16 +22,22 @@
> 
>  #include "libavutil/attributes.h"
>  #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
>  #include "libavcodec/lpc.h"
> 
>  void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *);
> +void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *);
> 
>  av_cold void ff_lpc_init_riscv(LPCContext *c)
>  {
>  #if HAVE_RVV && (__riscv_xlen >= 64)
>      int flags = av_get_cpu_flags();
> 
> -    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR))
> +    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
>          c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv;
> +
> +        if (ff_get_rv_vlenb() >= c->max_order)
> +            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_rvv;
> +    }
>  #endif
>  }
> diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S
> index f81a2392c1..654156bf12 100644
> --- a/libavcodec/riscv/lpc_rvv.S
> +++ b/libavcodec/riscv/lpc_rvv.S
> @@ -85,4 +85,33 @@ func ff_lpc_apply_welch_window_rvv, zve64d
> 
>          ret
>  endfunc
> +
> +func ff_lpc_compute_autocorr_rvv, zve64d
> +        li        t0, 1
> +        vsetvli   t1, a2, e64, m8, ta, ma

t1 is unused, so the destination operand of this vsetvli should be `zero`.
This is a leftover from an incomplete attempt to unroll the loop.

> +        fcvt.d.l  ft0, t0
> +        vle64.v   v0, (a0)
> +        sh3add    a0, a2, a0   # data += lag
> +        vfmv.v.f  v16, ft0
> +        bge       a2, a1, 2f
> +1:
> +        vfmv.f.s  ft0, v0
> +        fld       ft1, (a0)    # ft1 = data[lag + i]
> +        vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j]
> +        addi      a1, a1, -1
> +        vfslide1down.vf v0, v0, ft1
> +        addi      a0, a0, 8
> +        bgt       a1, a2, 1b   # while (len > lag);
> +2:
> +        vfmv.f.s  ft0, v0
> +        vsetvli   zero, a1, e64, m8, tu, ma
> +        vfmacc.vf v16, ft0, v0
> +        addi      a1, a1, -1
> +        vslide1down.vx v0, v0, zero
> +        bnez      a1, 2b       # while (len > 0);
> +
> +        vsetvli   zero, a2, e64, m8, ta, ma
> +        vse64.v   v16, (a3)
> +        ret
> +endfunc
>  #endif

-- 
レミ・デニ-クールモン
http://www.remlab.net/