[FFmpeg-devel] [PATCH 1/4] lavc/flacenc: add sse4 version of the 16-bit lpc encoder
Michael Niedermayer
michaelni at gmx.at
Sat Mar 15 00:01:08 CET 2014
On Wed, Mar 12, 2014 at 01:03:03PM +0100, James Darnley wrote:
> From 1.8 to 2.4 times faster. Runtime is reduced by 2 to 39%. The
> speed-up generally increases with compression_level.
>
> This lpc encoder is not used with levels < 3 so it provides no speed-up
> in these cases.
> ---
> LICENSE | 1 +
> libavcodec/flacenc.c | 2 +-
> libavcodec/x86/Makefile | 3 +
> libavcodec/x86/flac_dsp_gpl.asm | 79 +++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/flacdsp_init.c | 4 ++
> 5 files changed, 88 insertions(+), 1 deletions(-)
> create mode 100644 libavcodec/x86/flac_dsp_gpl.asm
>
> diff --git a/LICENSE b/LICENSE
> index 1f757aa..c194087 100644
> --- a/LICENSE
> +++ b/LICENSE
> @@ -15,6 +15,7 @@ Specifically, the GPL parts of FFmpeg are
> - libpostproc
> - libmpcodecs
> - optional x86 optimizations in the files
> + libavcodec/x86/flac_dsp_gpl.asm
> libavcodec/x86/idct_mmx.c
> - libutvideo encoding/decoding wrappers in
> libavcodec/libutvideo*.cpp
> diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
> index 1fc8c4c..e958cd8 100644
> --- a/libavcodec/flacenc.c
> +++ b/libavcodec/flacenc.c
> @@ -80,7 +80,7 @@ typedef struct FlacSubframe {
> int shift;
> RiceContext rc;
> int32_t samples[FLAC_MAX_BLOCKSIZE];
> - int32_t residual[FLAC_MAX_BLOCKSIZE+1];
> + int32_t residual[FLAC_MAX_BLOCKSIZE+3];
> } FlacSubframe;
>
> typedef struct FlacFrame {
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 0d3594f..374b1d2 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -76,6 +76,9 @@ YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
> YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
> YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
> YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
> +ifdef CONFIG_GPL
> +YASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
> +endif
> YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
> YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
> x86/h264_chromamc_10bit.o
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> new file mode 100644
> index 0000000..cc07194
> --- /dev/null
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -0,0 +1,79 @@
> +;*****************************************************************************
> +;* FLAC DSP functions
> +;*
> +;* Copyright (c) 2014 James Darnley <james.darnley at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License along
> +;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_TEXT
> +
> +INIT_XMM sse4
> +%if ARCH_X86_64
> + cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs, posj, negj
> + %define length r2d
> +
> + movsxd orderq, orderd
> +%else
> + cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, posj, order, coefs, negj
> + %define length r2mp
> +%endif
> +
> +; Is it worth looping correctly over the first samples? The most that ever need
> +; to be copied is 32 so we might as well just unroll the loop and do all 32.
Implementations should not make assumptions about how they are used beyond
what is documented in the API.
Or, the other way around:
if some limitation always holds and you want to write an
implementation that takes advantage of that limitation for optimization,
then the limitation should be documented in the API first
(in this case, for FLACDSPContext / lpc_encode).
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Breaking DRM is a little like attempting to break through a door even
though the window is wide open and the only thing in the house is a bunch
of things you don't want and which you would get tomorrow for free anyway.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140315/4a77e55d/attachment.asc>
More information about the ffmpeg-devel
mailing list