[FFmpeg-devel] [PATCH 1/7] lavc/flacenc: add sse4 version of the lpc encoder
Michael Niedermayer
michaelni at gmx.at
Sat Feb 15 20:44:39 CET 2014
On Sat, Feb 15, 2014 at 02:54:57AM +0100, James Darnley wrote:
> From 1.8 to 2.4 times faster. Runtime is reduced by 2 to 39%. The
> speed-up generally increases with compression_level.
>
> This lpc encoder is not used with levels < 3 so it provides no speed-up
> in these cases.
> ---
> LICENSE | 1 +
> libavcodec/x86/Makefile | 3 +
> libavcodec/x86/flac_dsp_gpl.asm | 78 +++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/flacdsp_init.c | 4 ++
> 4 files changed, 86 insertions(+), 0 deletions(-)
> create mode 100644 libavcodec/x86/flac_dsp_gpl.asm
>
> diff --git a/LICENSE b/LICENSE
> index 1f757aa..490adff 100644
> --- a/LICENSE
> +++ b/LICENSE
> @@ -16,6 +16,7 @@ Specifically, the GPL parts of FFmpeg are
> - libmpcodecs
> - optional x86 optimizations in the files
> libavcodec/x86/idct_mmx.c
> + libavcodec/x86/flac_dsp_gpl.asm
> - libutvideo encoding/decoding wrappers in
> libavcodec/libutvideo*.cpp
> - the X11 grabber in libavdevice/x11grab.c
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 7b56178..2cf9d2c 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -75,6 +75,9 @@ YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
> YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
> YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
> YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
> +ifdef CONFIG_GPL
> +YASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
> +endif
> YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
> YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
> x86/h264_chromamc_10bit.o
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> new file mode 100644
> index 0000000..3ce5fdf
> --- /dev/null
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -0,0 +1,78 @@
> +;*****************************************************************************
> +;* FLAC DSP functions
> +;*
> +;* Copyright (c) 2014 James Darnley <james.darnley at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License along
> +;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_TEXT
> +
> +INIT_XMM sse4
> +%if ARCH_X86_64
> + cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs, shift
> + %define posj r5
> + %define negj r6
> +%else
> + cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, len, order, coefs, shift
> + %define posj r2
> + %define negj r5
> +%endif
> +
> +; Is it worth looping correctly over the first samples? The most that ever need
> +; to be copied is 32 so we might as well just unroll the loop and do all 32.
> +%assign iter 0
> +%rep 32/(mmsize/4)
> + movu m0, [smpq+iter]
> + movu [resq+iter], m0
> + %assign iter iter+mmsize
> +%endrep
> +
> +lea resq, [resq+orderq*4]
> +lea smpq, [smpq+orderq*4]
> +lea coefsq, [coefsq+orderq*4]
> +sub lenmp, orderq
> +movd m3, shiftmp
> +neg orderq
[...]
> +void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
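You are mixing 32-bit function arguments (int) with 64-bit reads of them
in the asm: for example, "order" is declared as int in the prototype, but the
code uses the full 64-bit register orderq in the lea/sub/neg instructions.
On x86-64 the high 32 bits of such a register could be non-zero, which would
produce wrong addresses and cause a crash.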
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Good people do not need laws to tell them to act responsibly, while bad
people will find a way around the laws. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140215/3343a111/attachment.asc>
More information about the ffmpeg-devel
mailing list