[FFmpeg-devel] [PATCH] Altivec split-radix FFT

Fri Sep 18 16:58:20 CEST 2009

Loren Merritt <lorenm at u.washington.edu> writes:

> Now without Apple workarounds.
>
> --Loren Merritt
>
> From 100bf86add9537c20f4db2c5d2eeb2880c9c705e Mon Sep 17 00:00:00 2001
> From: Loren Merritt <pengvado at akuvian.org>
> Date: Sat, 22 Aug 2009 11:22:09 +0100
> Subject: [PATCH 1/3] altivec split-radix FFT
>  1.8x faster than altivec radix-2 on a G4
>  8% faster vorbis decoding
>
> ---
>  configure                      |    2 +
>  libavcodec/Makefile            |    4 +-
>  libavcodec/fft.c               |    2 +-
>  libavcodec/ppc/asm.S           |   59 ++++++++
>  libavcodec/ppc/fft_altivec.c   |  152 +++++++-------------
>  libavcodec/ppc/fft_altivec_s.S |  322 ++++++++++++++++++++++++++++++++++++++++
>  libavcodec/ppc/types_altivec.h |    1 +
>  7 files changed, 440 insertions(+), 102 deletions(-)
>  create mode 100644 libavcodec/ppc/asm.S
>  create mode 100644 libavcodec/ppc/fft_altivec_s.S
>
> diff --git a/configure b/configure
> index 6aa6177..fdb6ade 100755
> --- a/configure
> +++ b/configure
> @@ -933,6 +933,7 @@ HAVE_LIST="
>      fast_cmov
>      fast_unaligned
>      fork
> +    fsf_gas
>      gethrtime
>      GetProcessTimes
>      getrusage
> @@ -2108,6 +2109,7 @@ elif enabled mips; then
>  elif enabled ppc; then
>
>      check_asm dcbzl     '"dcbzl 0, 1"'
> +    check_asm fsf_gas   '"add 0, 0, 0"'
>      check_asm ppc4xx    '"maclhw r10, r11, r12"'
>      check_asm xform_asm '"lwzx 0, %y0" :: "Z"(*(int*)0)'

This is the syntax the IBM PPC documentation uses, so branding it fsf
seems a bit wrong.

> diff --git a/libavcodec/fft.c b/libavcodec/fft.c
> index c827139..52bc73a 100644
> --- a/libavcodec/fft.c
> +++ b/libavcodec/fft.c
> @@ -89,7 +89,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
>      s->split_radix = 1;
>
>      if (ARCH_ARM)     ff_fft_init_arm(s);
> -    if (HAVE_ALTIVEC) ff_fft_init_altivec(s);
> +    if (HAVE_ALTIVEC && HAVE_FSF_GAS) ff_fft_init_altivec(s);
>      if (HAVE_MMX)     ff_fft_init_mmx(s);
>
>      if (s->split_radix) {

I'd rather hide this condition inside ff_fft_init_altivec().

> diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c
> index 83b2b7f..235f205 100644
> --- a/libavcodec/ppc/fft_altivec.c
> +++ b/libavcodec/ppc/fft_altivec.c
> [...]
> +extern FFTSample ff_cos_16[];

Isn't this declared in some header file?  If not, it should be.

> +// pointers to functions. but unlike function pointers on some PPC ABIs, these aren't function descriptors.

What does that mean?  Not saying it's wrong, I just don't understand it.

> From fc014fba0a5c39e47d2cb651856d1b1e83c4edf9 Mon Sep 17 00:00:00 2001
> From: Loren Merritt <pengvado at akuvian.org>
> Date: Wed, 26 Aug 2009 09:44:32 +0100
> Subject: [PATCH 2/3] remove vestiges of radix-2 FFT
>
> ---
>  libavcodec/dsputil.h         |    3 -
>  libavcodec/fft.c             |   83 +++---------------------------------------------------
>  libavcodec/ppc/fft_altivec.c |    1 
>  3 files changed, 5 insertions(+), 82 deletions(-)
>
> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
> index d100d5f..901f8a7 100644
> --- a/libavcodec/dsputil.h
> +++ b/libavcodec/dsputil.h
> @@ -675,15 +675,12 @@ typedef struct FFTContext {
>      int nbits;
>      int inverse;
>      uint16_t *revtab;
> -    FFTComplex *exptab;
> -    FFTComplex *exptab1; /* only used by SSE code */

I need to update the offsets in the NEON code for this, so don't
commit without telling me first.

> diff --git a/libavcodec/fft.c b/libavcodec/fft.c
> index 52bc73a..a7c7b96 100644
> --- a/libavcodec/fft.c
> +++ b/libavcodec/fft.c
> @@ -60,39 +60,30 @@ static int split_radix_permutation(int i, int n, int inverse)
>
>  av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
>  {
> -    int i, j, m, n;
> -    float alpha, c1, s1, s2;
> -    int av_unused has_vectors;
> +    int i, j, n;
>
>      if (nbits < 2 || nbits > 16)
>          goto fail;
>      s->nbits = nbits;
>      n = 1 << nbits;
>
> -    s->tmp_buf = NULL;
> -    s->exptab  = av_malloc((n / 2) * sizeof(FFTComplex));
> -    if (!s->exptab)
> -        goto fail;
>      s->revtab = av_malloc(n * sizeof(uint16_t));
>      if (!s->revtab)
>          goto fail;
> +    s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
> +    if (!s->tmp_buf)
> +        goto fail;
>      s->inverse = inverse;
>
> -    s2 = inverse ? 1.0 : -1.0;
> -
>      s->fft_permute = ff_fft_permute_c;
>      s->fft_calc    = ff_fft_calc_c;
>      s->imdct_calc  = ff_imdct_calc_c;
>      s->imdct_half  = ff_imdct_half_c;
> -    s->mdct_calc   = ff_mdct_calc_c;

I think that line should stay.

> -    s->exptab1     = NULL;
> -    s->split_radix = 1;
>
>      if (ARCH_ARM)     ff_fft_init_arm(s);
>      if (HAVE_ALTIVEC && HAVE_FSF_GAS) ff_fft_init_altivec(s);
>      if (HAVE_MMX)     ff_fft_init_mmx(s);
>
> diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c
> index 235f205..9f01992 100644
> --- a/libavcodec/ppc/fft_altivec.c
> +++ b/libavcodec/ppc/fft_altivec.c
> @@ -89,5 +89,4 @@ void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
>  av_cold void ff_fft_init_altivec(FFTContext *s)
>  {
>      s->fft_calc = ff_fft_calc_altivec;
> -    s->split_radix = 1;
>  }
>
> From e2e5f93792d62becaa666723ce26f81472e2eddc Mon Sep 17 00:00:00 2001
> From: Loren Merritt <pengvado at akuvian.org>
> Date: Wed, 26 Aug 2009 10:08:06 +0100
> Subject: [PATCH 3/3] altivec imdct
>  1.8x faster than C iMDCT (excluding the FFT part) on a G4
>  10% faster vorbis decoding
>
> ---
>  libavcodec/ppc/fft_altivec.c |  113 ++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 110 insertions(+), 3 deletions(-)
>
> diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c
> index 9f01992..0d4ced3 100644
> --- a/libavcodec/ppc/fft_altivec.c
> +++ b/libavcodec/ppc/fft_altivec.c
> @@ -49,7 +49,7 @@ static void swizzle(vec_f *z, int n)
>      }
>  }
>
> -void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
> +static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_swizzle)
>  {
>      register vec_f  v14 __asm__("v14") = {0,0,0,0};
>      register vec_f  v15 __asm__("v15") = *(const vec_f*)ff_cos_16;
> @@ -75,18 +75,125 @@ void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
>          "subi 1,1,%1 \n"
>          "bctrl \n"
>          "addi 1,1,%1 \n"
> -        ::"r"(ff_fft_dispatch_altivec[1][s->nbits-2]), "i"(12*sizeof(void*)),
> +        ::"r"(ff_fft_dispatch_altivec[do_swizzle][s->nbits-2]), "i"(12*sizeof(void*)),
>            "r"(zarg), "r"(cos_tabs),
>            "v"(v14),"v"(v15),"v"(v16),"v"(v17),"v"(v18),"v"(v19),"v"(v20),"v"(v21),
>            "v"(v22),"v"(v23),"v"(v24),"v"(v25),"v"(v26),"v"(v27),"v"(v28),"v"(v29)
>          : "lr","ctr","r0","r4","r5","r6","r7","r8","r9","r10","r11",
>            "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13"
>      );
> -    if (s->nbits <= 4)
> +    if (do_swizzle && s->nbits <= 4)
>          swizzle((vec_f*)z, 1<<s->nbits);
>  }
>
> +static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
> +{
> +    fft_dispatch(s, z, 1);
> +}
> +
> +static void ff_imdct_half_altivec(MDCTContext *s, FFTSample *output, const FFTSample *input)
> +{
> +    int j, k;
> +    int n = 1 << s->nbits;
> +    int n4 = n >> 2;
> +    int n8 = n >> 3;
> +    int n32 = n >> 5;
> +    const uint16_t *revtabj = s->fft.revtab;
> +    const uint16_t *revtabk = s->fft.revtab+n4;
> +    const vec_f *tcos = (const vec_f*)(s->tcos+n8);
> +    const vec_f *tsin = (const vec_f*)(s->tsin+n8);
> +    const vec_f *pin = (const vec_f*)(input+n4);
> +    vec_f *pout = (vec_f*)(output+n4);

Why the intrinsics here?

-- 
M?ns Rullg?rd
mans at mansr.com