[FFmpeg-devel] [PATCH] x86/aacpsdsp: add SSE and SSE3 optimized functions
Michael Niedermayer
michael at niedermayer.cc
Sat Jul 25 14:39:20 CEST 2015
On Fri, Jul 24, 2015 at 11:00:55PM -0300, James Almer wrote:
> Between 1.5 and 2.5 times faster
>
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
> There's a couple missing, like ps_stereo_interpolate_ipdopd which i wanted to write
> but couldn't test because it was not used by any of the samples i tried.
>
> libavcodec/aacps.c | 4 +-
> libavcodec/aacpsdsp.h | 1 +
> libavcodec/aacpsdsp_template.c | 2 +
> libavcodec/x86/Makefile | 6 +-
> libavcodec/x86/aacpsdsp.asm | 212 +++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/aacpsdsp_init.c | 55 +++++++++++
> 6 files changed, 276 insertions(+), 4 deletions(-)
> create mode 100644 libavcodec/x86/aacpsdsp.asm
> create mode 100644 libavcodec/x86/aacpsdsp_init.c
>
> diff --git a/libavcodec/aacps.c b/libavcodec/aacps.c
> index bf60475..eec6e30 100644
> --- a/libavcodec/aacps.c
> +++ b/libavcodec/aacps.c
> @@ -936,8 +936,8 @@ static void stereo_processing(PSContext *ps, INTFLOAT (*l)[32][2], INTFLOAT (*r)
> H22[0][e+1][b] = h22;
> }
> for (k = 0; k < NR_BANDS[is34]; k++) {
> - INTFLOAT h[2][4];
> - INTFLOAT h_step[2][4];
> + LOCAL_ALIGNED_16(INTFLOAT, h, [2], [4]);
> + LOCAL_ALIGNED_16(INTFLOAT, h_step, [2], [4]);
> int start = ps->border_position[e];
> int stop = ps->border_position[e+1];
> INTFLOAT width = Q30(1.f) / (stop - start);
> diff --git a/libavcodec/aacpsdsp.h b/libavcodec/aacpsdsp.h
> index 9e3c5aa..c194bbe 100644
> --- a/libavcodec/aacpsdsp.h
> +++ b/libavcodec/aacpsdsp.h
> @@ -52,5 +52,6 @@ typedef struct PSDSPContext {
> void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s);
> void ff_psdsp_init_arm(PSDSPContext *s);
> void ff_psdsp_init_mips(PSDSPContext *s);
> +void ff_psdsp_init_x86(PSDSPContext *s);
>
> #endif /* LIBAVCODEC_AACPSDSP_H */
> diff --git a/libavcodec/aacpsdsp_template.c b/libavcodec/aacpsdsp_template.c
> index bfec828..3049ce8 100644
> --- a/libavcodec/aacpsdsp_template.c
> +++ b/libavcodec/aacpsdsp_template.c
> @@ -224,5 +224,7 @@ av_cold void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s)
> ff_psdsp_init_arm(s);
> if (ARCH_MIPS)
> ff_psdsp_init_mips(s);
> + if (ARCH_X86)
> + ff_psdsp_init_x86(s);
> #endif /* !USE_FIXED */
> }
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index a515ebd..c403770 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -38,7 +38,8 @@ OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
> OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
>
> # decoders/encoders
> -OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
> +OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
> + x86/sbrdsp_init.o
> OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
> OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
> OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
> @@ -130,7 +131,8 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
> x86/vp8dsp_loopfilter.o
>
> # decoders/encoders
> -YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
> +YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
> + x86/sbrdsp.o
> YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
> YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
> YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
> diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
> new file mode 100644
> index 0000000..d416944
> --- /dev/null
> +++ b/libavcodec/x86/aacpsdsp.asm
> @@ -0,0 +1,212 @@
> +;******************************************************************************
> +;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
> +;*
> +;* Copyright (C) 2015 James Almer
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
> +
> +SECTION_TEXT
> +
> +;*************************************************************************
> +;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
> +;*************************************************************************
> +%macro PS_ADD_SQUARES 1
> +cglobal ps_add_squares, 3, 3, %1, dst, src, n
> +.loop:
> + movaps m0, [srcq]
> + movaps m1, [srcq+mmsize]
> + mulps m0, m0
> + mulps m1, m1
> +%if cpuflag(sse3)
> + haddps m0, m1
> +%else
> + movaps m3, m0
> + movaps m4, m1
> + shufps m3, m3, q0301
> + shufps m4, m4, q0301
> + addps m0, m3
> + addps m1, m4
> + shufps m0, m1, q2020
> +%endif
> + addps m0, [dstq]
> + movaps [dstq], m0
> + add dstq, mmsize
> + add srcq, mmsize*2
> + sub nd, mmsize/4
> + jg .loop
> + REP_RET
> +%endmacro
> +
> +INIT_XMM sse
> +PS_ADD_SQUARES 3
> +INIT_XMM sse3
> +PS_ADD_SQUARES 5
> +
> +;*******************************************************************
> +;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
> +; float *src1, int n);
> +;*******************************************************************
> +INIT_XMM sse
> +cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n
> + xor r4q, r4q
> +
> +.loop:
> + movu m0, [src1q+r4q]
> + movu m1, [src1q+r4q+mmsize]
> + mova m2, [src2q]
> + mova m3, m2
> + unpcklps m2, m2
> + unpckhps m3, m3
> + mulps m0, m2
> + mulps m1, m3
> + mova [dstq+r4q], m0
> + mova [dstq+r4q+mmsize], m1
> + add src2q, mmsize
> + add r4q, mmsize*2
> + sub nd, mmsize/4
> + jg .loop
> + REP_RET
> +
> +;***********************************************************************
> +;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
> +; float h[2][4], float h_step[2][4],
> +; int len);
> +;***********************************************************************
> +INIT_XMM sse3
> +cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
> + movaps m0, [hq]
> + movaps m1, [h_stepq]
> + shl nd, 3
> + add lq, nq
> + add rq, nq
> + neg nq
> +
> +align 16
> +.loop:
This assumes n >= 0.
I don't think the calling code guarantees this.
Either the calling code should be changed or this should be checked
for.
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
In a rich man's house there is no place to spit but his face.
-- Diogenes of Sinope
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150725/b32076ec/attachment.sig>
More information about the ffmpeg-devel
mailing list