[FFmpeg-devel] [PATCH v2] aacenc: add SIMD optimizations for abs_pow34 and quantization
Michael Niedermayer
michael at niedermayer.cc
Sun Oct 9 18:04:26 EEST 2016
On Sun, Oct 09, 2016 at 01:15:44PM +0100, Rostislav Pehlivanov wrote:
> On 9 October 2016 at 03:18, Michael Niedermayer <michael at niedermayer.cc>
> wrote:
>
> > On Sat, Oct 08, 2016 at 06:42:28PM +0100, Rostislav Pehlivanov wrote:
> > > Performance improvements:
> > >
> > > quant_bands:
> > > with: 681 decicycles in quant_bands, 8388453 runs, 155 skips
> > > without: 1190 decicycles in quant_bands, 8388386 runs, 222 skips
> > > Around 42% for the function
> > >
> > > Twoloop coder:
> > >
> > > abs_pow34:
> > > with/without: 7.82s/8.17s
> > > Around 4% for the entire encoder
> > >
> > > Both:
> > > with/without: 7.15s/8.17s
> > > Around 12% for the entire encoder
> > >
> > > Fast coder:
> > >
> > > abs_pow34:
> > > with/without: 3.40s/3.77s
> > > Around 10% for the entire encoder
> > >
> > > Both:
> > > with/without: 3.02s/3.77s
> > > Around 20% faster for the entire encoder
> > >
> > > Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>
> > > ---
> > > libavcodec/aaccoder.c | 22 ++++----
> > > libavcodec/aaccoder_trellis.h | 2 +-
> > > libavcodec/aaccoder_twoloop.h | 2 +-
> > > libavcodec/aacenc.c | 4 ++
> > > libavcodec/aacenc.h | 6 +++
> > > libavcodec/aacenc_is.c | 6 +--
> > > libavcodec/aacenc_ltp.c | 4 +-
> > > libavcodec/aacenc_pred.c | 6 +--
> > > libavcodec/aacenc_quantization.h | 4 +-
> > > libavcodec/aacenc_utils.h | 4 +-
> > > libavcodec/x86/Makefile | 2 +
> > > libavcodec/x86/aacencdsp.asm | 108 +++++++++++++++++++++++++++++++++++++++
> > > libavcodec/x86/aacencdsp_init.c | 42 +++++++++++++++
> > > 13 files changed, 187 insertions(+), 25 deletions(-)
> > > create mode 100644 libavcodec/x86/aacencdsp.asm
> > > create mode 100644 libavcodec/x86/aacencdsp_init.c
> >
> > libavcodec/x86/aacencdsp.asm:67: error: expression syntax error
> > libavcodec/x86/aacencdsp.asm:79: warning: (RUN_AVX_INSTR:22) use of ``movd'' sse2 instruction in sse function: ff_aac_quantize_bands_sse
> > libavcodec/x86/aacencdsp.asm:99: warning: (RUN_AVX_INSTR:22) use of ``pand'' sse2 instruction in sse function: ff_aac_quantize_bands_sse
> > libavcodec/x86/aacencdsp.asm:103: warning: (RUN_AVX_INSTR:20) use of ``cvttps2dq'' sse2 instruction in sse function: ff_aac_quantize_bands_sse
> >
> > yasm 1.2.0
> >
> > [...]
> > --
> > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> >
> > Breaking DRM is a little like attempting to break through a door even
> > though the window is wide open and the only thing in the house is a bunch
> > of things you don't want and which you would get tomorrow for free anyway
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel at ffmpeg.org
> > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >
> Yes, this was discussed on IRC and I fixed it yesterday. I've attached the
> patch if you want to test it.
> I'll push it tonight unless it is confirmed not to work on Windows (I haven't
> tested there, but it should work according to what nevcairiel said).
> aaccoder.c | 22 ++++++------
> aaccoder_trellis.h | 2 -
> aaccoder_twoloop.h | 2 -
> aacenc.c | 4 ++
> aacenc.h | 6 +++
> aacenc_is.c | 6 +--
> aacenc_ltp.c | 4 +-
> aacenc_pred.c | 6 +--
> aacenc_quantization.h | 4 +-
> aacenc_utils.h | 4 +-
> x86/Makefile | 2 +
> x86/aacencdsp.asm | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++
> x86/aacencdsp_init.c | 43 ++++++++++++++++++++++++
> 13 files changed, 167 insertions(+), 25 deletions(-)
> 8f8dd9c1cca110b682dbb73cbae6643798336aec 0001-aacenc-add-SIMD-optimizations-for-abs_pow34-and-quan.patch
> From 3bc5622e5be67698d099a191ebfd297bf1eda7cd Mon Sep 17 00:00:00 2001
> From: Rostislav Pehlivanov <atomnuker at gmail.com>
> Date: Sat, 8 Oct 2016 15:59:14 +0100
> Subject: [PATCH] aacenc: add SIMD optimizations for abs_pow34 and quantization
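This segfaults on x86-32. In the dump below, the faulting movaps loads from ecx+edx = 0xffffc4cc, which is not 16-byte aligned, while movaps requires an aligned memory operand: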
make fate-gaplessenc-itunes-to-ipod-aac V=2
...
Stream mapping:
Stream #0:0 -> #0:0 (aac (native) -> aac (native))
Segmentation fault
make: *** [fate-gaplessenc-itunes-to-ipod-aac] Error 1
Program received signal SIGSEGV, Segmentation fault.
ff_abs_pow34_sse () at src/libavcodec/x86/aacencdsp.asm:42
42 mova m0, [inq+sizeq]
(gdb) bt
Python Exception <type 'exceptions.ImportError'> No module named gdb.frames:
#0 ff_abs_pow34_sse () at src/libavcodec/x86/aacencdsp.asm:42
#1 0x08bfe132 in search_for_ms (s=0xf7c02020, cpe=0xf12e9020) at src/libavcodec/aaccoder.c:794
#2 0x089fc754 in aac_encode_frame (avctx=0x98ac900, avpkt=0x98cc000, frame=0x98e55a0, got_packet_ptr=0xffffcedc) at src/libavcodec/aacenc.c:735
#3 0x0877182d in avcodec_encode_audio2 (avctx=0x98ac900, avpkt=0x98cc000, frame=<optimized out>, got_packet_ptr=0xffffcedc) at src/libavcodec/utils.c:1886
#4 0x0877226f in do_encode (avctx=0x98ac900, frame=0x98e55a0, got_packet=0xffffcedc) at src/libavcodec/utils.c:2939
#5 0x08774287 in avcodec_send_frame (avctx=0x98ac900, frame=0x98e55a0) at src/libavcodec/utils.c:2985
#6 0x080ebbbb in do_audio_out (frame=0x98e55a0, ost=0x98ac760, of=0x98a8500) at src/ffmpeg.c:888
#7 reap_filters (flush=0) at src/ffmpeg.c:1460
#8 0x080f1ee5 in transcode_step () at src/ffmpeg.c:4343
#9 transcode () at src/ffmpeg.c:4387
#10 0x080ce57e in main (argc=<optimized out>, argv=<optimized out>) at src/ffmpeg.c:4592
disassemble $pc-32,$pc+32
Dump of assembler code from 0x8c42bcc to 0x8c42c0c:
0x08c42bcc: nop
0x08c42bcd: nop
0x08c42bce: nop
0x08c42bcf: nop
0x08c42bd0 <ff_abs_pow34_sse+0>: mov 0x4(%esp),%eax
0x08c42bd4 <ff_abs_pow34_sse+4>: mov 0x8(%esp),%ecx
0x08c42bd8 <ff_abs_pow34_sse+8>: mov 0xc(%esp),%edx
0x08c42bdc <ff_abs_pow34_sse+12>: movaps 0x9031ef0,%xmm2
0x08c42be3 <ff_abs_pow34_sse+19>: shl $0x2,%edx
0x08c42be6 <ff_abs_pow34_sse+22>: add %edx,%ecx
0x08c42be8 <ff_abs_pow34_sse+24>: add %edx,%eax
0x08c42bea <ff_abs_pow34_sse+26>: neg %edx
=> 0x08c42bec <ff_abs_pow34_sse+28>: movaps (%ecx,%edx,1),%xmm0
0x08c42bf0 <ff_abs_pow34_sse+32>: andps %xmm2,%xmm0
0x08c42bf3 <ff_abs_pow34_sse+35>: sqrtps %xmm0,%xmm1
0x08c42bf6 <ff_abs_pow34_sse+38>: mulps %xmm1,%xmm0
0x08c42bf9 <ff_abs_pow34_sse+41>: sqrtps %xmm0,%xmm0
0x08c42bfc <ff_abs_pow34_sse+44>: movaps %xmm0,(%eax,%edx,1)
0x08c42c00 <ff_abs_pow34_sse+48>: add $0x10,%edx
0x08c42c03 <ff_abs_pow34_sse+51>: jl 0x8c42bec <ff_abs_pow34_sse+28>
0x08c42c05 <ff_abs_pow34_sse+53>: repz ret
0x08c42c07 <ff_abs_pow34_sse.loop+27>: nopw 0x0(%eax,%eax,1)
eax 0xf7c0bbb0 -138363984
ecx 0xffffc4dc -15140
edx 0xfffffff0 -16
ebx 0xffffc6cc -14644
esp 0xffffc36c 0xffffc36c
ebp 0xf7c02020 0xf7c02020
esi 0xffffc4cc -15156
edi 0xf12fb320 -248532192
eip 0x8c42bec 0x8c42bec <ff_abs_pow34_sse+28>
eflags 0x10287 [ CF PF SF IF RF ]
cs 0x23 35
ss 0x2b 43
ds 0x2b 43
es 0x2b 43
fs 0x0 0
gs 0x63 99
st0 457.1953125 (raw 0x4007e499000000000000)
st1 13.3905181884765625 (raw 0x4002d63f900000000000)
st2 0 (raw 0x00000000000000000000)
st3 0 (raw 0x00000000000000000000)
st4 0 (raw 0x00000000000000000000)
st5 0.5 (raw 0x3ffe8000000000000000)
st6 0 (raw 0x00000000000000000000)
st7 0 (raw 0x00000000000000000000)
fctrl 0x37f 895
fstat 0x36 54
ftag 0xffff 65535
fiseg 0x0 0
fioff 0x8bfe115 146792725
foseg 0x0 0
fooff 0xffffc3e0 -15392
fop 0x0 0
mxcsr 0x1fa0 [ PE IM DM ZM OM UM PM ]
ymm0 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0, 0x0, 0x0, 0x3b, 0x0, 0x0, 0x0, 0x3b, 0x0, 0x0, 0x0, 0x3b, 0x0, 0x0, 0x0, 0x3b, 0x0 <repeats 16 times>}, v16_int16 = {0x0, 0x3b00, 0x0, 0x3b00, 0x0, 0x3b00, 0x0, 0x3b00, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x3b000000, 0x3b000000, 0x3b000000, 0x3b000000, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x3b0000003b000000, 0x3b0000003b000000, 0x0, 0x0}, v2_int128 = {0x3b0000003b0000003b0000003b000000, 0x00000000000000000000000000000000}}
ymm1 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x97, 0x90, 0xcf, 0x3e, 0x97, 0x90, 0xcf, 0x3e, 0x97, 0x90, 0xcf, 0x3e, 0x97, 0x90, 0xcf, 0x3e, 0x0 <repeats 16 times>}, v16_int16 = {0x9097, 0x3ecf, 0x9097, 0x3ecf, 0x9097, 0x3ecf, 0x9097,
0x3ecf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x3ecf9097, 0x3ecf9097, 0x3ecf9097, 0x3ecf9097, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x3ecf90973ecf9097, 0x3ecf90973ecf9097, 0x0, 0x0}, v2_int128 = {0x3ecf90973ecf90973ecf90973ecf9097, 0x00000000000000000000000000000000}}
ymm2 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x8000000000000000, 0x8000000000000000, 0x0, 0x0}, v32_int8 = {0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0x0 <repeats 16 times>}, v16_int16 = {0xffff, 0x7fff, 0xffff,
0x7fff, 0xffff, 0x7fff, 0xffff, 0x7fff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x7fffffff7fffffff, 0x7fffffff7fffffff, 0x0, 0x0}, v2_int128 = {0x7fffffff7fffffff7fffffff7fffffff,
0x00000000000000000000000000000000}}
ymm3 {v8_float = {0xc, 0xc, 0xc, 0xc, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x200000, 0x200000, 0x0, 0x0}, v32_int8 = {0x0, 0x0, 0x40, 0x41, 0x0, 0x0, 0x40, 0x41, 0x0, 0x0, 0x40, 0x41, 0x0, 0x0, 0x40, 0x41, 0x0 <repeats 16 times>}, v16_int16 = {0x0, 0x4140, 0x0, 0x4140, 0x0, 0x4140, 0x0, 0x4140, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x41400000, 0x41400000, 0x41400000, 0x41400000, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x4140000041400000, 0x4140000041400000, 0x0, 0x0}, v2_int128 = {0x41400000414000004140000041400000, 0x00000000000000000000000000000000}}
ymm4 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {
0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm5 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = {
0x00000000000000000000000000000000, 0x00000000000000000000000000000000}}
ymm6 {v8_float = {0x15, 0xffffffed, 0x12, 0xfffffff0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0xfffffffff9b86bf0, 0xfffffffffdb50378, 0x0, 0x0}, v32_int8 = {0x3e, 0xcb, 0xae, 0x41, 0x50, 0x1e, 0x99, 0xc1, 0x1c, 0x5b, 0x93, 0x41, 0xe4, 0x57, 0x82, 0xc1, 0x0 <repeats 16 times>}, v16_int16 = {0xcb3e,
0x41ae, 0x1e50, 0xc199, 0x5b1c, 0x4193, 0x57e4, 0xc182, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x41aecb3e, 0xc1991e50, 0x41935b1c, 0xc18257e4, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0xc1991e5041aecb3e, 0xc18257e441935b1c, 0x0, 0x0}, v2_int128 = {0xc18257e441935b1cc1991e5041aecb3e,
0x00000000000000000000000000000000}}
ymm7 {v8_float = {0xffffff72, 0x12, 0x8b, 0xfffffff0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x4d6c730, 0xfffffffffdb50378, 0x0, 0x0}, v32_int8 = {0x43, 0x1b, 0xe, 0xc3, 0x1c, 0x5b, 0x93, 0x41, 0x94, 0x18, 0xb, 0x43, 0xe4, 0x57, 0x82, 0xc1, 0x0 <repeats 16 times>}, v16_int16 = {0x1b43, 0xc30e, 0x5b1c,
0x4193, 0x1894, 0x430b, 0x57e4, 0xc182, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0xc30e1b43, 0x41935b1c, 0x430b1894, 0xc18257e4, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x41935b1cc30e1b43, 0xc18257e4430b1894, 0x0, 0x0}, v2_int128 = {0xc18257e4430b189441935b1cc30e1b43,
0x00000000000000000000000000000000}}
mm0 {uint64 = 0xe499000000000000, v2_int32 = {0x0, 0xe4990000}, v4_int16 = {0x0, 0x0, 0x0, 0xe499}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x99, 0xe4}}
mm1 {uint64 = 0xd63f900000000000, v2_int32 = {0x0, 0xd63f9000}, v4_int16 = {0x0, 0x0, 0x9000, 0xd63f}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x90, 0x3f, 0xd6}}
mm2 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}}
mm3 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}}
mm4 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}}
mm5 {uint64 = 0x8000000000000000, v2_int32 = {0x0, 0x80000000}, v4_int16 = {0x0, 0x0, 0x0, 0x8000}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80}}
mm6 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}}
mm7 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}}
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Those who are too smart to engage in politics are punished by being
governed by those who are dumber. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20161009/d63702dd/attachment.sig>
More information about the ffmpeg-devel mailing list