[FFmpeg-devel] [PATCH 4/4] lavc/flacenc: partially unroll loop in flac_enc_lpc_32

James Darnley james.darnley at gmail.com
Wed Mar 12 13:03:06 CET 2014


Now does 6 samples per iteration, up from 2.

From 1.6 to 2.1 times faster again.  2.5 to 3.9 times faster overall.
Runtime is reduced by a further 4 to 17%.  Reduced by 9 to 65% overall.

Same conditions as previously.
---
 libavcodec/x86/flac_dsp_gpl.asm |   32 +++++++++++++++++++++++++++-----
 1 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index ff63539..c935c79 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -129,12 +129,12 @@ RET
 
 INIT_XMM sse42
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_32, 5, 7, 4, mmsize, res, smp, len, order, coefs, posj, negj
+    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs, posj, negj
     %define length r2d
 
     movsxd orderq, orderd
 %else
-    cglobal flac_enc_lpc_32, 5, 6, 4, mmsize, res, smp, posj, order, coefs, negj
+    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, posj, order, coefs, negj
     %define length r2mp
 %endif
 
@@ -161,6 +161,8 @@ mova  [rsp],    m4            ; save sign extend mask
 
 .looplen:
     pxor m0,    m0
+    pxor m4,    m4
+    pxor m6,    m6
     mov  posjq, orderq
     xor  negjq, negjq
 
@@ -168,23 +170,43 @@ mova  [rsp],    m4            ; save sign extend mask
         movd   m2,  [coefsq+posjq*4] ; c = coefs[j]
         SPLATD m2
         movh   m1,  [smpq+negjq*4-4] ; s = smp[i-j-1]
+        movh   m5,  [smpq+negjq*4-4+mmsize/2]
+        movh   m7,  [smpq+negjq*4-4+mmsize]
         pshufd m1,   m1, q3130
+        pshufd m5,   m5, q3130
+        pshufd m7,   m7, q3130
         pmuldq m1,   m2
+        pmuldq m5,   m2
+        pmuldq m7,   m2
         paddq  m0,   m1              ; p += c * s
+        paddq  m4,   m5
+        paddq  m6,   m7
 
         dec    negjq
         inc    posjq
     jnz .looporder
 
     HACK_PSRAQ m0, m3, [rsp], m2     ; p >>= shift
+    HACK_PSRAQ m4, m3, [rsp], m2
+    HACK_PSRAQ m6, m3, [rsp], m2
     CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+    CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
+    CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
     pshufd  m0,    m0, q0020 ; pack into first 2 dwords
+    pshufd  m4,    m4, q0020
+    pshufd  m6,    m6, q0020
     movh    m1,   [smpq]
+    movh    m5,   [smpq+mmsize/2]
+    movh    m7,   [smpq+mmsize]
     psubd   m1,    m0                ; smp[i] - p
+    psubd   m5,    m4
+    psubd   m7,    m6
     movh   [resq], m1                ; res[i] = smp[i] - (p >> shift)
+    movh   [resq+mmsize/2], m5
+    movh   [resq+mmsize], m7
 
-    add resq,   mmsize/2
-    add smpq,   mmsize/2
-    sub length, mmsize/8
+    add resq,   (3*mmsize)/2
+    add smpq,   (3*mmsize)/2
+    sub length, (3*mmsize)/8
 jg .looplen
 RET
-- 
1.7.9



More information about the ffmpeg-devel mailing list