[FFmpeg-devel] [WIP] add sse4 flac lpc encoder
James Darnley
james.darnley at gmail.com
Mon Feb 3 02:18:00 CET 2014
A rather hacked-together patch adding an SSE4 version of the FLAC LPC
encoder for 16-bit samples, flac_lpc_encode_c_16(). It is rough, but it works correctly.
I have been using gprof to measure the time taken in functions.
> Each sample counts as 0.01 seconds.
> % cumulative self self total
> time seconds seconds calls ms/call ms/call name
Original code:
> 43.94 19.45 19.45 flac_lpc_encode_c_16
This patch:
> 25.74 17.10 8.54 ff_flac_enc_lpc_16_sse4
The fraction of total time is down from nearly half to just over a
quarter. The time reported by `time` is also lower, by about 12 seconds.
Original: 0m52.318s
Patch: 0m40.198s
These tests were done with compression level 8 which does skew the time
spent in these functions to be in my favour.
I already see that I can use 4 more xmm regs to unroll the loop more.
-------------- next part --------------
From 4c8c95931aa39cf6189b7efd504134ea080b8952 Mon Sep 17 00:00:00 2001
From: James Darnley <james.darnley at gmail.com>
Date: Sun, 2 Feb 2014 17:07:41 +0100
Subject: [PATCH 1/3] WIP add sse4 flac lpc encoder
---
libavcodec/flacdsp.c | 25 ++++++++++++++-
libavcodec/x86/Makefile | 2 +
libavcodec/x86/flac_dsp.asm | 71 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 97 insertions(+), 1 deletions(-)
create mode 100644 libavcodec/x86/flac_dsp.asm
diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index 02eba3e..8fae578 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -26,7 +26,6 @@
#define SAMPLE_SIZE 16
#define PLANAR 0
#include "flacdsp_template.c"
-#include "flacdsp_lpc_template.c"
#undef PLANAR
#define PLANAR 1
@@ -43,6 +42,30 @@
#define PLANAR 1
#include "flacdsp_template.c"
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, const int32_t *, int, int, int);
+
+/**
+ * Compute the LPC residual for 16-bit samples.
+ *
+ * The first `order` samples are copied unchanged as warm-up; for every later
+ * sample, res[i] = smp[i] - (sum_j coefs[j] * smp[i-j-1] >> shift).
+ *
+ * @param res    output residuals, at least `len` elements
+ * @param smp    input samples, at least `len` elements
+ * @param len    number of samples
+ * @param order  number of predictor coefficients
+ * @param coefs  predictor coefficients, `order` elements
+ * @param shift  right shift applied to the prediction sum
+ *
+ * NOTE(review): the SSE4 kernel is still called without any CPU-feature
+ * check; that must be added before this is usable on non-SSE4 hardware.
+ */
+static void flac_lpc_encode_c_16(int32_t *res, const int32_t *smp, int len,
+                                 int order, const int32_t *coefs, int shift)
+{
+    int i, j;
+    int simd_len = 0;
+
+    for (i = 0; i < order; i++)
+        res[i] = smp[i];
+
+    /* The SSE4 kernel stores four residuals per iteration and always runs
+     * its outer and inner loops at least once.  Only hand it whole groups
+     * of four (so it never touches memory past `len`) and skip it entirely
+     * when order == 0, where its inner loop would not terminate. */
+    if (order > 0 && len > order)
+        simd_len = (len - order) & ~3;
+    if (simd_len > 0)
+        ff_flac_enc_lpc_16_sse4(res + order, smp + order, coefs,
+                                simd_len, order, shift);
+
+    /* Scalar tail for the remaining 0..3 samples (or all of them when the
+     * kernel was skipped). */
+    for (i = order + simd_len; i < len; i++) {
+        int32_t p = 0;
+        for (j = 0; j < order; j++)
+            p += coefs[j] * smp[i - j - 1];
+        res[i] = smp[i] - (p >> shift);
+    }
+}
+
static void flac_lpc_16_c(int32_t *decoded, const int coeffs[32],
int pred_order, int qlevel, int len)
{
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index dddaae1..5c69e3e 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -107,3 +107,5 @@ YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9itxfm.o \
x86/vp9lpf.o \
x86/vp9mc.o
YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
+
+YASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp.o
diff --git a/libavcodec/x86/flac_dsp.asm b/libavcodec/x86/flac_dsp.asm
new file mode 100644
index 0000000..5a9a24c
--- /dev/null
+++ b/libavcodec/x86/flac_dsp.asm
@@ -0,0 +1,71 @@
+;*****************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+; The C code for 16-bit samples simplifies to this
+
+;for (i = order; i < len; i++) {
+; int j;
+; int32_t p = 0;
+; for (j = 0; j < order; j++) {
+; int c = coefs[j];
+; int s = smp[i-j-1];
+; p += (c*s);
+; }
+; res[i] = smp[i] - (p >> shift);
+;}
+
+;------------------------------------------------------------------------------
+; void ff_flac_enc_lpc_16_sse4(int32_t *res, const int32_t *smp,
+;                              const int32_t *coefs, int len, int order,
+;                              int shift)
+;
+; Computes four residuals per outer iteration:
+;     res[i] = smp[i] - (sum_j coefs[j] * smp[i-j-1] >> shift)
+; res and smp must already point at the first sample to predict.
+;
+; Both loops are do-while loops: the caller must pass order >= 1, and len is
+; effectively rounded up to a multiple of 4 (mmsize/4), so the buffers must
+; cover that many elements.
+;
+; The five leading arguments are loaded into registers so the two loop
+; counters (posj, negj) get registers of their own; previously they aliased
+; the len/order argument registers on x86-64.
+;------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal flac_enc_lpc_16, 5, 7, 4, res, smp, coefs, len, order, posj, negj
+    ; shift is read before the loops because on UNIX64 r5 (its argument
+    ; register) is reused as posj below.
+    movd        m3, r5m                  ; shift count for psrad
+
+.loop_len:
+    pxor        m0, m0                   ; p = 0 (four lanes)
+    xor         posjd, posjd             ; j = 0 (also clears the upper half)
+    xor         negjq, negjq             ; -j, kept full width for addressing
+
+.loop_order:
+    movd        m2, [coefsq+posjq*4]     ; c = coefs[j]
+    SPLATD      m2
+    movu        m1, [smpq+negjq*4-4]     ; s = smp[i-j-1] .. smp[i-j+2]
+    pmulld      m1, m2
+    paddd       m0, m1                   ; p += c * s
+
+    add         posjd, 1
+    sub         negjq, 1
+    cmp         posjd, orderd            ; 32-bit compare: order is an int
+    jne         .loop_order
+
+    psrad       m0, m3                   ; p >>= shift
+    movu        m1, [smpq]
+    psubd       m1, m0                   ; smp[i] - p
+    movu        [resq], m1               ; res[i] = smp[i] - (p >> shift)
+
+    add         resq, mmsize
+    add         smpq, mmsize
+    sub         lend, mmsize/4           ; four samples consumed
+    jg          .loop_len
+    RET
--
1.7.9
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 683 bytes
Desc: OpenPGP digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140203/4c0d5b8d/attachment.asc>
More information about the ffmpeg-devel
mailing list