[FFmpeg-devel] [PATCH 04/11] x86: dcadsp: implement SSE lfe_dir
Michael Niedermayer
michaelni at gmx.at
Tue Feb 11 03:01:15 CET 2014
On Fri, Feb 07, 2014 at 10:35:22PM +0100, Christophe Gisquet wrote:
> Hi,
>
> 2014-02-07 Loren Merritt <lorenm at u.washington.edu>:
> > On Thu, 6 Feb 2014, Christophe Gisquet wrote:
> >
> >> diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
> >> index 03593ce..4a682be 100644
> >> --- a/libavcodec/x86/dcadsp.asm
> >> +++ b/libavcodec/x86/dcadsp.asm
> >> @@ -88,3 +88,108 @@ INT8X8_FMUL_INT32 3
> >>
> >> INIT_XMM sse4
> >> INT8X8_FMUL_INT32 3
> >> +
> >> +; %1=v0/v1 %2=in1 %3=in2
> >> +%macro FIR_LOOP 2-3
> >> +.loop%1:
> >> +%define va m1
> >> +%define vb m2
> >> +%if %1
> >> +%define OFFSET 0
> >> +%else
> >> +%define OFFSET NUM_COEF*count
> >> +%endif
> >> +; for v0, incrementint and for v1, decrementing
> >> + mova va, [cf0q + OFFSET]
> >> + mova vb, [cf0q + OFFSET + 4*NUM_COEF]
> >> +%if %0 == 3
> >> + mova m4, [cf0q + OFFSET + mmsize]
> >> + mova SCALE, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
> >> +%endif
> >> + mulps va, %2
> >> + mulps vb, %2
> >> +%if %0 == 3
> >> + mulps m4, %3
> >> + mulps SCALE, %3
> >> + addps va, m4
> >> + addps vb, SCALE
> >> +%endif
> >> + ; va = va1 va2 va3 va4
> >> + ; vb = vb1 vb2 vb3 vb4
> >> +%if %1
> >> +%define O1 vb
> >> +%define O2 va
> >> +%else
> >> +%define O1 va
> >> +%define O2 vb
> >> +%endif
> >
> > Can this be simplified with
> > %if %1
> > SWAP va, vb
> > %endif
> > and no O1, O2 variables?
> >
> >> + mova m4, O1
> >> + unpcklps O1, O2 ; va3 vb3 va4 vb4
> >> + unpckhps m4, O2 ; va1 vb1 va2 vb2
> >> + addps m4, O1 ; va1+3 vb1+3 va2+4 vb2+4
> >> + movhlps O2, m4 ; va1+3 vb1+3
> >> + addps O2, m4 ; va0..4 vb0..4
> >> +%if %1
> >> + movh [outq + count], O2
> >> + sub cf0q, 8*NUM_COEF
> >> +%else
> >> + movh [outq + count], O2
> >
> > factor out of the %if
>
> All was ok, so here's a new patch.
>
> --
> Christophe
> dcadsp.asm | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> dcadsp_init.c | 6 +++
> 2 files changed, 105 insertions(+)
> e15392342e6a1a9c3b150d5dbc5b4054a16e1c91 0004-x86-dcadsp-implement-SSE-lfe_dir.patch
> From b60bf426995afef4e5673412a50a994f6d581b18 Mon Sep 17 00:00:00 2001
> From: Christophe Gisquet <christophe.gisquet at gmail.com>
> Date: Wed, 19 Dec 2012 20:26:05 +0100
> Subject: [PATCH 04/10] x86: dcadsp: implement SSE lfe_dir
I think you can merge the scale factor into the lfe_fir_* tables,
avoiding some instructions.
Also, the coeff table looks constant, so you can reorder it any
way you like at no cost.
The whole code looks like a 4-input, 128-output or
8-input, 64-output matrix multiplication with a constant matrix.
I'm not sure what the fastest way to implement this is, but
you could form all 4 needed permutations of the input and then do a
simpler 4x(mova, mulps, addps) inner loop.
I may have missed a detail here or there, but I suspect this can
be done more efficiently than how it's implemented now (with differently
ordered coeff tables).
>
> Results for Arrandale/Windows:
> 32: 1670 -> 316
> 64: 728 -> 298
> ---
> libavcodec/x86/dcadsp.asm | 99 ++++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/dcadsp_init.c | 6 +++
> 2 files changed, 105 insertions(+)
>
> diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
> index 214f514..731854e 100644
> --- a/libavcodec/x86/dcadsp.asm
> +++ b/libavcodec/x86/dcadsp.asm
> @@ -88,3 +88,102 @@ INT8X8_FMUL_INT32
>
> INIT_XMM sse4
> INT8X8_FMUL_INT32
> +
> +; %1=v0/v1 %2=in1 %3=in2
> +%macro FIR_LOOP 2-3
> +.loop%1:
> +%define va m1
> +%define vb m2
> +%if %1
> +%define OFFSET 0
> +%else
> +%define OFFSET NUM_COEF*count
> +%endif
> +; for v0, incrementint and for v1, decrementing
> + mova va, [cf0q + OFFSET]
> + mova vb, [cf0q + OFFSET + 4*NUM_COEF]
> +%if %0 == 3
> + mova m4, [cf0q + OFFSET + mmsize]
> + mova SCALE, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
> +%endif
> + mulps va, %2
> + mulps vb, %2
> +%if %0 == 3
> + mulps m4, %3
> + mulps SCALE, %3
> + addps va, m4
> + addps vb, SCALE
> +%endif
> + ; va = va1 va2 va3 va4
> + ; vb = vb1 vb2 vb3 vb4
> +%if %1
> + SWAP va, vb
> +%endif
> + mova m4, va
> + unpcklps va, vb ; va3 vb3 va4 vb4
> + unpckhps m4, vb ; va1 vb1 va2 vb2
> + addps m4, va ; va1+3 vb1+3 va2+4 vb2+4
> + movhlps vb, m4 ; va1+3 vb1+3
> + addps vb, m4 ; va0..4 vb0..4
> + movh [outq + count], vb
> +%if %1
> + sub cf0q, 8*NUM_COEF
> +%endif
> + add count, 8
> + jl .loop%1
> +%endmacro
> +
> +; dca_lfe_fir(float *out, float *in, float *coefs, float scale)
> +%macro DCA_LFE_FIR 1
> +cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0, scale
> +
> +%if WIN64
> + SWAP 0, 3
> +%endif
> +%define SCALE m0
> +%define IN1 m3
> +%define IN2 m5
> +%define count inq
> +%define NUM_COEF 4*(2-%1)
> +%define NUM_OUT 32*(%1+1)
> +
> +%if ARCH_X86_32
> + movss SCALE, scalem
> +%endif
> +
> + movu IN1, [inq + 4 - 1*mmsize]
> + shufps IN1, IN1, q0123
> +%if %1 == 0
> + movu IN2, [inq + 4 - 2*mmsize]
> + shufps IN2, IN2, q0123
> +%endif
> +
> + mov count, -4*NUM_OUT
> + SPLATD SCALE
> + add cf0q, 4*NUM_COEF*NUM_OUT
> + add outq, 4*NUM_OUT
> + ; compute v0 first
> + mulps IN1, SCALE
> +%if %1 == 0
> + mulps IN2, SCALE
> + FIR_LOOP 0, IN1, IN2
> +%else
> + FIR_LOOP 0, IN1
> +%endif
> + shufps IN1, IN1, q0123
> + mov count, -4*NUM_OUT
> + ; cf1 already correctly positioned
> + add outq, 4*NUM_OUT ; outq now at out2
> + sub cf0q, 8*NUM_COEF
> +%if %1 == 0
> + shufps IN2, IN2, q0123
> + FIR_LOOP 1, IN2, IN1
> +%else
> + FIR_LOOP 1, IN1
> +%endif
> + RET
> +%endmacro
> +
> +INIT_XMM sse
> +DCA_LFE_FIR 0
> +DCA_LFE_FIR 1
> diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
> index 976d8a3..d649ecd 100644
> --- a/libavcodec/x86/dcadsp_init.c
> +++ b/libavcodec/x86/dcadsp_init.c
> @@ -26,6 +26,10 @@
> void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
> void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
> void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
> +void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs,
> + float scale);
> +void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs,
> + float scale);
>
> av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
> {
> @@ -35,6 +39,8 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
> #if ARCH_X86_32
> s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
> #endif
> + s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
> + s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
> }
>
> if (EXTERNAL_SSE2(cpu_flags)) {
> --
> 1.8.0.msysgit.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
it is not once nor twice but times without number that the same ideas make
their appearance in the world. -- Aristotle
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140211/e0d10815/attachment.asc>
More information about the ffmpeg-devel
mailing list