[FFmpeg-devel] [PATCH 2/4] lavc/flacenc: partially unroll loop in flac_enc_lpc_16

Wed Mar 12 13:03:04 CET 2014

The loop in flac_enc_lpc_16 now processes 12 samples per iteration, up from 4.

This makes it a further 1.8 to 3.2 times faster, or 3.6 to 5.7 times faster
overall.  Runtime is reduced by a further 2 to 18%, for an overall runtime
reduction of 4 to 50%.

Same conditions as before apply.
---
 libavcodec/flacenc.c            |    2 +-
 libavcodec/x86/flac_dsp_gpl.asm |   26 +++++++++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index e958cd8..88f57e6 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -80,7 +80,7 @@ typedef struct FlacSubframe {
     int shift;
     RiceContext rc;
     int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+3];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
 } FlacSubframe;
 
 typedef struct FlacFrame {
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index cc07194..326ba12 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -26,12 +26,12 @@ SECTION_TEXT
 
 INIT_XMM sse4
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs, posj, negj
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs, posj, negj
     %define length r2d
 
     movsxd orderq, orderd
 %else
-    cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, posj, order, coefs, negj
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, posj, order, coefs, negj
     %define length r2mp
 %endif
 
@@ -53,6 +53,8 @@ neg  orderq
 
 .looplen:
     pxor m0,    m0
+    pxor m4,    m4
+    pxor m6,    m6
     mov  posjq, orderq
     xor  negjq, negjq
 
@@ -60,20 +62,34 @@ neg  orderq
         movd   m2, [coefsq+posjq*4] ; c = coefs[j]
         SPLATD m2
         movu   m1, [smpq+negjq*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negjq*4-4+mmsize]
+        movu   m7, [smpq+negjq*4-4+mmsize*2]
         pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
         paddd  m0,  m1              ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
 
         dec    negjq
         inc    posjq
     jnz .looporder
 
     psrad  m0,     m3               ; p >>= shift
+    psrad  m4,     m3
+    psrad  m6,     m3
     movu   m1,    [smpq]
+    movu   m5,    [smpq+mmsize]
+    movu   m7,    [smpq+mmsize*2]
     psubd  m1,     m0               ; smp[i] - p
+    psubd  m5,     m4
+    psubd  m7,     m6
     movu  [resq],  m1               ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
 
-    add resq,   mmsize
-    add smpq,   mmsize
-    sub length, mmsize/4
+    add resq,    3*mmsize
+    add smpq,    3*mmsize
+    sub length, (3*mmsize)/4
 jg .looplen
 RET
-- 
1.7.9