[FFmpeg-devel] [PATCH] Altivec split-radix FFT
Måns Rullgård
mans
Fri Sep 18 16:58:20 CEST 2009
Loren Merritt <lorenm at u.washington.edu> writes:
> Now without Apple workarounds.
>
> --Loren Merritt
>
> From 100bf86add9537c20f4db2c5d2eeb2880c9c705e Mon Sep 17 00:00:00 2001
> From: Loren Merritt <pengvado at akuvian.org>
> Date: Sat, 22 Aug 2009 11:22:09 +0100
> Subject: [PATCH 1/3] altivec split-radix FFT
> 1.8x faster than altivec radix-2 on a G4
> 8% faster vorbis decoding
>
> ---
> configure | 2 +
> libavcodec/Makefile | 4 +-
> libavcodec/fft.c | 2 +-
> libavcodec/ppc/asm.S | 59 ++++++++
> libavcodec/ppc/fft_altivec.c | 152 +++++++-------------
> libavcodec/ppc/fft_altivec_s.S | 322 ++++++++++++++++++++++++++++++++++++++++
> libavcodec/ppc/types_altivec.h | 1 +
> 7 files changed, 440 insertions(+), 102 deletions(-)
> create mode 100644 libavcodec/ppc/asm.S
> create mode 100644 libavcodec/ppc/fft_altivec_s.S
>
> diff --git a/configure b/configure
> index 6aa6177..fdb6ade 100755
> --- a/configure
> +++ b/configure
> @@ -933,6 +933,7 @@ HAVE_LIST="
> fast_cmov
> fast_unaligned
> fork
> + fsf_gas
> gethrtime
> GetProcessTimes
> getrusage
> @@ -2108,6 +2109,7 @@ elif enabled mips; then
> elif enabled ppc; then
>
> check_asm dcbzl '"dcbzl 0, 1"'
> + check_asm fsf_gas '"add 0, 0, 0"'
> check_asm ppc4xx '"maclhw r10, r11, r12"'
> check_asm xform_asm '"lwzx 0, %y0" :: "Z"(*(int*)0)'
This is the syntax the IBM PPC documentation uses, so branding it fsf
seems a bit wrong.
> diff --git a/libavcodec/fft.c b/libavcodec/fft.c
> index c827139..52bc73a 100644
> --- a/libavcodec/fft.c
> +++ b/libavcodec/fft.c
> @@ -89,7 +89,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
> s->split_radix = 1;
>
> if (ARCH_ARM) ff_fft_init_arm(s);
> - if (HAVE_ALTIVEC) ff_fft_init_altivec(s);
> + if (HAVE_ALTIVEC && HAVE_FSF_GAS) ff_fft_init_altivec(s);
> if (HAVE_MMX) ff_fft_init_mmx(s);
>
> if (s->split_radix) {
I'd rather hide this condition inside ff_fft_init_altivec().
> diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c
> index 83b2b7f..235f205 100644
> --- a/libavcodec/ppc/fft_altivec.c
> +++ b/libavcodec/ppc/fft_altivec.c
> [...]
> +extern FFTSample ff_cos_16[];
Isn't this declared in some header file? If not, it should be.
> +// pointers to functions. but unlike function pointers on some PPC ABIs, these aren't function descriptors.
What does that mean? Not saying it's wrong, I just don't understand it.
> From fc014fba0a5c39e47d2cb651856d1b1e83c4edf9 Mon Sep 17 00:00:00 2001
> From: Loren Merritt <pengvado at akuvian.org>
> Date: Wed, 26 Aug 2009 09:44:32 +0100
> Subject: [PATCH 2/3] remove vestiges of radix-2 FFT
>
> ---
> libavcodec/dsputil.h | 3 -
> libavcodec/fft.c | 83 +++---------------------------------------------------
> libavcodec/ppc/fft_altivec.c | 1
> 3 files changed, 5 insertions(+), 82 deletions(-)
>
> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
> index d100d5f..901f8a7 100644
> --- a/libavcodec/dsputil.h
> +++ b/libavcodec/dsputil.h
> @@ -675,15 +675,12 @@ typedef struct FFTContext {
> int nbits;
> int inverse;
> uint16_t *revtab;
> - FFTComplex *exptab;
> - FFTComplex *exptab1; /* only used by SSE code */
I need to update the offsets in the NEON code for this, so don't
commit without telling me first.
> diff --git a/libavcodec/fft.c b/libavcodec/fft.c
> index 52bc73a..a7c7b96 100644
> --- a/libavcodec/fft.c
> +++ b/libavcodec/fft.c
> @@ -60,39 +60,30 @@ static int split_radix_permutation(int i, int n, int inverse)
>
> av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
> {
> - int i, j, m, n;
> - float alpha, c1, s1, s2;
> - int av_unused has_vectors;
> + int i, j, n;
>
> if (nbits < 2 || nbits > 16)
> goto fail;
> s->nbits = nbits;
> n = 1 << nbits;
>
> - s->tmp_buf = NULL;
> - s->exptab = av_malloc((n / 2) * sizeof(FFTComplex));
> - if (!s->exptab)
> - goto fail;
> s->revtab = av_malloc(n * sizeof(uint16_t));
> if (!s->revtab)
> goto fail;
> + s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
> + if (!s->tmp_buf)
> + goto fail;
> s->inverse = inverse;
>
> - s2 = inverse ? 1.0 : -1.0;
> -
> s->fft_permute = ff_fft_permute_c;
> s->fft_calc = ff_fft_calc_c;
> s->imdct_calc = ff_imdct_calc_c;
> s->imdct_half = ff_imdct_half_c;
> - s->mdct_calc = ff_mdct_calc_c;
I think that line should stay.
> - s->exptab1 = NULL;
> - s->split_radix = 1;
>
> if (ARCH_ARM) ff_fft_init_arm(s);
> if (HAVE_ALTIVEC && HAVE_FSF_GAS) ff_fft_init_altivec(s);
> if (HAVE_MMX) ff_fft_init_mmx(s);
>
> diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c
> index 235f205..9f01992 100644
> --- a/libavcodec/ppc/fft_altivec.c
> +++ b/libavcodec/ppc/fft_altivec.c
> @@ -89,5 +89,4 @@ void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
> av_cold void ff_fft_init_altivec(FFTContext *s)
> {
> s->fft_calc = ff_fft_calc_altivec;
> - s->split_radix = 1;
> }
>
> From e2e5f93792d62becaa666723ce26f81472e2eddc Mon Sep 17 00:00:00 2001
> From: Loren Merritt <pengvado at akuvian.org>
> Date: Wed, 26 Aug 2009 10:08:06 +0100
> Subject: [PATCH 3/3] altivec imdct
> 1.8x faster than C iMDCT (excluding the FFT part) on a G4
> 10% faster vorbis decoding
>
> ---
> libavcodec/ppc/fft_altivec.c | 113 ++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 110 insertions(+), 3 deletions(-)
>
> diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c
> index 9f01992..0d4ced3 100644
> --- a/libavcodec/ppc/fft_altivec.c
> +++ b/libavcodec/ppc/fft_altivec.c
> @@ -49,7 +49,7 @@ static void swizzle(vec_f *z, int n)
> }
> }
>
> -void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
> +static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_swizzle)
> {
> register vec_f v14 __asm__("v14") = {0,0,0,0};
> register vec_f v15 __asm__("v15") = *(const vec_f*)ff_cos_16;
> @@ -75,18 +75,125 @@ void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
> "subi 1,1,%1 \n"
> "bctrl \n"
> "addi 1,1,%1 \n"
> - ::"r"(ff_fft_dispatch_altivec[1][s->nbits-2]), "i"(12*sizeof(void*)),
> + ::"r"(ff_fft_dispatch_altivec[do_swizzle][s->nbits-2]), "i"(12*sizeof(void*)),
> "r"(zarg), "r"(cos_tabs),
> "v"(v14),"v"(v15),"v"(v16),"v"(v17),"v"(v18),"v"(v19),"v"(v20),"v"(v21),
> "v"(v22),"v"(v23),"v"(v24),"v"(v25),"v"(v26),"v"(v27),"v"(v28),"v"(v29)
> : "lr","ctr","r0","r4","r5","r6","r7","r8","r9","r10","r11",
> "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13"
> );
> - if (s->nbits <= 4)
> + if (do_swizzle && s->nbits <= 4)
> swizzle((vec_f*)z, 1<<s->nbits);
> }
>
> +static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
> +{
> + fft_dispatch(s, z, 1);
> +}
> +
> +static void ff_imdct_half_altivec(MDCTContext *s, FFTSample *output, const FFTSample *input)
> +{
> + int j, k;
> + int n = 1 << s->nbits;
> + int n4 = n >> 2;
> + int n8 = n >> 3;
> + int n32 = n >> 5;
> + const uint16_t *revtabj = s->fft.revtab;
> + const uint16_t *revtabk = s->fft.revtab+n4;
> + const vec_f *tcos = (const vec_f*)(s->tcos+n8);
> + const vec_f *tsin = (const vec_f*)(s->tsin+n8);
> + const vec_f *pin = (const vec_f*)(input+n4);
> + vec_f *pout = (vec_f*)(output+n4);
Why the intrinsics here?
--
M?ns Rullg?rd
mans at mansr.com
More information about the ffmpeg-devel
mailing list