[FFmpeg-devel] [PATCH 4/4] lavc/flacenc: partially unroll loop in flac_enc_lpc_32
James Darnley
james.darnley at gmail.com
Wed Mar 12 13:03:06 CET 2014
Now does 6 samples per iteration, up from 2.
From 1.6 to 2.1 times faster again. 2.5 to 3.9 times faster overall.
Runtime is reduced by a further 4 to 17%. Reduced by 9 to 65% overall.
Same conditions as previously.
---
libavcodec/x86/flac_dsp_gpl.asm | 32 +++++++++++++++++++++++++++-----
1 files changed, 27 insertions(+), 5 deletions(-)
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index ff63539..c935c79 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -129,12 +129,12 @@ RET
INIT_XMM sse42
%if ARCH_X86_64
- cglobal flac_enc_lpc_32, 5, 7, 4, mmsize, res, smp, len, order, coefs, posj, negj
+ cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs, posj, negj
%define length r2d
movsxd orderq, orderd
%else
- cglobal flac_enc_lpc_32, 5, 6, 4, mmsize, res, smp, posj, order, coefs, negj
+ cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, posj, order, coefs, negj
%define length r2mp
%endif
@@ -161,6 +161,8 @@ mova [rsp], m4 ; save sign extend mask
.looplen:
pxor m0, m0
+ pxor m4, m4
+ pxor m6, m6
mov posjq, orderq
xor negjq, negjq
@@ -168,23 +170,43 @@ mova [rsp], m4 ; save sign extend mask
movd m2, [coefsq+posjq*4] ; c = coefs[j]
SPLATD m2
movh m1, [smpq+negjq*4-4] ; s = smp[i-j-1]
+ movh m5, [smpq+negjq*4-4+mmsize/2]
+ movh m7, [smpq+negjq*4-4+mmsize]
pshufd m1, m1, q3130
+ pshufd m5, m5, q3130
+ pshufd m7, m7, q3130
pmuldq m1, m2
+ pmuldq m5, m2
+ pmuldq m7, m2
paddq m0, m1 ; p += c * s
+ paddq m4, m5
+ paddq m6, m7
dec negjq
inc posjq
jnz .looporder
HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
+ HACK_PSRAQ m4, m3, [rsp], m2
+ HACK_PSRAQ m6, m3, [rsp], m2
CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+ CLIPQ m4, [pq_int_min], [pq_int_max], m2
+ CLIPQ m6, [pq_int_min], [pq_int_max], m2
pshufd m0, m0, q0020 ; pack into first 2 dwords
+ pshufd m4, m4, q0020
+ pshufd m6, m6, q0020
movh m1, [smpq]
+ movh m5, [smpq+mmsize/2]
+ movh m7, [smpq+mmsize]
psubd m1, m0 ; smp[i] - p
+ psubd m5, m4
+ psubd m7, m6
movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
+ movh [resq+mmsize/2], m5
+ movh [resq+mmsize], m7
- add resq, mmsize/2
- add smpq, mmsize/2
- sub length, mmsize/8
+ add resq, (3*mmsize)/2
+ add smpq, (3*mmsize)/2
+ sub length, (3*mmsize)/8
jg .looplen
RET
--
1.7.9
More information about the ffmpeg-devel
mailing list