[FFmpeg-devel] [PATCH 05/10] lavc/flacenc: partially unroll loop in flac_enc_lpc_16_sse4

James Darnley james.darnley at gmail.com
Wed Feb 12 00:11:17 CET 2014


It now does 12 samples per iteration, up from 4.

From 1.8 to 3.2 times faster again.  3.6 to 5.7 times faster overall.
Runtime is reduced by a further 2 to 18%.  Overall runtime reduced by
4 to 50%.

Same conditions as before apply.
---
 libavcodec/x86/flac_dsp_gpl.asm |   26 +++++++++++++++++++++-----
 1 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 9498c4c..59baba8 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -24,11 +24,11 @@
 
 INIT_XMM sse4
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_16, 6, 8, 4, 0, res, smp, len, order, coefs, shift
+    cglobal flac_enc_lpc_16, 6, 8, 8, 0, res, smp, len, order, coefs, shift
     %define posj r6
     %define negj r7
 %else
-    cglobal flac_enc_lpc_16, 6, 6, 4, 0, res, smp, len, order, coefs, shift
+    cglobal flac_enc_lpc_16, 6, 6, 8, 0, res, smp, len, order, coefs, shift
     %define posj r2
     %define negj r5
 %endif
@@ -49,14 +49,22 @@ movd m3, shiftmp
 
 .looplen:
     pxor m0,  m0
+    pxor m4,  m4
+    pxor m6,  m6
     xor posj, posj
     xor negj, negj
     .looporder:
         movd   m2, [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
         movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
         pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
         paddd  m0,  m1             ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
 
         add posj, 1
         sub negj, 1
@@ -64,12 +72,20 @@ movd m3, shiftmp
     jne .looporder
 
     psrad m0, m3                   ; p >>= shift
+    psrad m4, m3
+    psrad m6, m3
     movu  m1, [smpq]
+    movu  m5, [smpq+mmsize]
+    movu  m7, [smpq+mmsize*2]
     psubd m1, m0                   ; smp[i] - p
+    psubd m5, m4
+    psubd m7, m6
     movu  [resq], m1               ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
 
-    add resq, mmsize
-    add smpq, mmsize
-    sub lenmp, mmsize/4
+    add resq, 3*mmsize
+    add smpq, 3*mmsize
+    sub lenmp, (3*mmsize)/4
 jg .looplen
 RET
-- 
1.7.9



More information about the ffmpeg-devel mailing list