[FFmpeg-devel] [PATCH 06/10] lavu/aes: add x86 AESNI optimizations
Henrik Gramner
henrik at gramner.com
Tue Oct 13 22:44:15 CEST 2015
I changed the asm a bit and made it about 1 cycle faster on Haswell
and slightly smaller (-48 bytes overall incl. alignment on 64-bit
Linux).
;-----------------------------------------------------------------------------
; AES_CRYPT enc|dec
;
; Expands to one function, aes_encrypt or aes_decrypt (the macro argument is
; pasted into the symbol name and into the aesenc/aesdec mnemonics), that
; runs AES-NI over consecutive 16-byte blocks, optionally chained through an
; IV in CBC fashion.
;
; Arguments as they are used below (cglobal: 6 args, 6 GPRs, 2 XMM regs):
;   r0 = base of the expanded key: 16-byte round keys, key[0] at offset 0
;   r1 = destination buffer
;   r2 = source buffer
;   r3 = number of 16-byte blocks (32-bit value)
;   r4 = pointer to a 16-byte IV, or NULL for no chaining
;   r5 = number of rounds (32-bit value); expected to be 10, 12 or 14
; XMM usage: m0 = block being processed, m1 = chaining value.
;
; Notes:
;   * The round keys are memory operands of aesenc/aesdec, so with the legacy
;     SSE encoding they must be 16-byte aligned. src, dst and iv are only
;     touched through movu and may be unaligned.
;   * The loop is bottom-tested: one block is always processed before the
;     counter is checked, so callers must pass a block count >= 1.
;   * NOTE(review): only the enc variant writes the final chaining value back
;     to [r4]; the dec variant leaves the IV buffer untouched. Confirm that
;     this matches what callers expect when decrypting across several calls.
;-----------------------------------------------------------------------------
%macro AES_CRYPT 1
cglobal aes_%1rypt, 6,6,2
    shl       r3d, 4                ; block count -> byte count
    add       r5d, r5d              ; r5 = 2*rounds, so 8*r5 == 16*rounds below
    add        r0, 0x60             ; bias the key pointer so that every round-key
                                    ; displacement (-0x60..+0x70) fits in a signed byte
    add        r2, r3               ; point src and dst at the end of their buffers ...
    add        r1, r3
    neg        r3                   ; ... and index them with r3 = -bytes, counting up to 0
    pxor       m1, m1               ; chaining value is zero unless an IV is supplied
    test       r4, r4
    je .block
    movu       m1, [r4]             ; iv

.block:
    movu       m0, [r2+r3]          ; state
%ifidn %1, enc
    pxor       m0, m1               ; encrypt: mix in the previous ciphertext / IV first
%endif
    pxor       m0, [r0+8*r5-0x60]   ; initial AddRoundKey with key[rounds]

    ; Enter the unrolled round sequence according to the key size. r5d holds
    ; 2*rounds: equal to 24 -> 12 rounds, below -> 10, above falls through (14).
    cmp       r5d, 24
    je .rounds12
    jl .rounds10
    aes%1      m0, [r0+0x70]        ; key[13]
    aes%1      m0, [r0+0x60]        ; key[12]
.rounds12:
    aes%1      m0, [r0+0x50]        ; key[11]
    aes%1      m0, [r0+0x40]        ; key[10]
.rounds10:
    aes%1      m0, [r0+0x30]        ; key[9]
    aes%1      m0, [r0+0x20]
    aes%1      m0, [r0+0x10]
    aes%1      m0, [r0+0x00]
    aes%1      m0, [r0-0x10]
    aes%1      m0, [r0-0x20]
    aes%1      m0, [r0-0x30]
    aes%1      m0, [r0-0x40]
    aes%1      m0, [r0-0x50]        ; key[1]
    aes%1last  m0, [r0-0x60]        ; final round with key[0]

    test       r4, r4
    je .noiv
%ifidn %1, enc
    mova       m1, m0               ; this ciphertext chains into the next block
%else
    pxor       m0, m1               ; decrypt: remove the previous ciphertext / IV
    movu       m1, [r2+r3]          ; next chaining value = this ciphertext block, read
                                    ; before the store below so dst may alias src
%endif
.noiv:                              ; trailing colon added: a bare label alone on a line
                                    ; draws an orphan-label warning from the assembler
    movu  [r1+r3], m0
    add        r3, 16
    jl .block                       ; r3 is still negative while blocks remain

%ifidn %1, enc
    ; Hand the last ciphertext block back through the IV buffer.
    test       r4, r4
    je .ret
    movu     [r4], m0
.ret:
%endif
    REP_RET
%endmacro
; Assemble the AES-NI functions only when HAVE_AESNI_EXTERNAL is non-zero
; (presumably set by the build configuration when the external assembler
; understands AES-NI mnemonics -- TODO confirm).
%if HAVE_AESNI_EXTERNAL
; x86inc convention: map m0/m1 onto XMM registers and tag the functions
; declared below as "aesni" variants.
INIT_XMM aesni
; One expansion per direction: aes_encrypt (aesenc/aesenclast) and
; aes_decrypt (aesdec/aesdeclast).
AES_CRYPT enc
AES_CRYPT dec
%endif
More information about the ffmpeg-devel
mailing list