[FFmpeg-devel] [PATCH] Port x264 SSE2 deblocking code to H.264 decoder
Mike Melanson
mike
Sat Dec 20 04:18:16 CET 2008
Jason Garrett-Glaser wrote:
> On Thu, Dec 18, 2008 at 8:28 PM, Jason Garrett-Glaser
> <darkshikari at gmail.com> wrote:
>> On Thu, Dec 18, 2008 at 5:34 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
>>> On Thu, Dec 18, 2008 at 05:22:51PM -0800, Jason Garrett-Glaser wrote:
>>>> On Thu, Dec 18, 2008 at 5:09 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
>>>>> On Thu, Dec 18, 2008 at 04:47:24PM -0800, Jason Garrett-Glaser wrote:
>>>>>> OK, now we have luma_intra in DSPutil, so this should be easier.
>>>>>>
>>>>>> Michael: how should we rename the x264 functions? My thought was just
>>>>>> to s/x264/ff_h264/ or something of the sort, which would modify the
>>>>>> code from x264's version but make it trivial to modify the code before
>>>>>> committing any updates from x264. It wouldn't need ugly #defines
>>>>>> either.
>>>>> Didn't Loren post some patch that changed cglobal to add a prefix
>>>>> automagically ...
>>>> Shouldn't that be a separate patch? If it's fine, can I commit his
>>>> patch now then?
>>> I've approved his patch, so yes, you can apply it of course
>> applied (sorry for forgetting credit, forgot to add Loren's name to
>> commit message).
>>
>> Updated x264 deblock patch attached.
>>
>> Dark Shikari
>>
>
> Small error in patch fixed.
In case you did not see my response on ffmpeg-cvslog, this code causes
the H.264 decoder to segfault (on about 90 of the conformance samples)
when compiled with Intel's C Compiler, which we are sort of trying to
support. I had time to look at it a little more but did not find much
more useful information. I know that a build configured with a plain
'./configure --cc="icc"' is fine, but a build that also specifies
--enable-gpl segfaults.
Here comes the info dump, using the sample from this test spec:
http://fate.multimedia.cx/index.php?test_spec=8
~/build-icc$ gdb ./ffmpeg_g
GNU gdb 6.8-debian
Copyright (C) 2008 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "i486-linux-gnu"...
(gdb) r -i /mnt/fate-suite/h264-conformance/BA1_FT_C.264 -f framecrc -
Starting program: /home/melanson/build-icc/ffmpeg_g -i /mnt/fate-suite/h264-conformance/BA1_FT_C.264 -f framecrc -
FFmpeg version SVN-r16243, Copyright (c) 2000-2008 Fabrice Bellard, et al.
configuration: --cc=/opt/intel/cc/10.1.015/bin/icc --enable-gpl
libavutil 49.12. 0 / 49.12. 0
libavcodec 52. 7. 0 / 52. 7. 0
libavformat 52.23. 1 / 52.23. 1
libavdevice 52. 1. 0 / 52. 1. 0
built on Dec 19 2008 18:49:39, gcc: Intel(R) C++ gcc 4.2 mode
Program received signal SIGSEGV, Segmentation fault.
ff_x264_deblock_v_luma_intra_sse2 ()
at /home/melanson/fate/source/libavcodec/i386/h264_deblock_sse2.asm:743
743 DEBLOCK_LUMA_INTRA sse2, v
Current language: auto; currently asm
(gdb) bt
#0 ff_x264_deblock_v_luma_intra_sse2 ()
at /home/melanson/fate/source/libavcodec/i386/h264_deblock_sse2.asm:743
#1 0xbfe773b0 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) disass $pc-32 $pc+32
Dump of assembler code from 0x838e9d2 to 0x838ea12:
0x0838e9d2 <ff_x264_deblock_v_luma_intra_sse2+18>: sbb $0x83,%al
0x0838e9d4 <ff_x264_deblock_v_luma_intra_sse2+20>: in (%dx),%al
0x0838e9d5 <ff_x264_deblock_v_luma_intra_sse2+21>: pusha
0x0838e9d6 <ff_x264_deblock_v_luma_intra_sse2+22>: lea 0x0(,%ecx,4),%esi
0x0838e9dd <ff_x264_deblock_v_luma_intra_sse2+29>: lea (%ecx,%ecx,2),%edi
0x0838e9e0 <ff_x264_deblock_v_luma_intra_sse2+32>: dec %edx
0x0838e9e1 <ff_x264_deblock_v_luma_intra_sse2+33>: jl 0x838ed6e <ff_x264_deblock_v_luma_intra_sse2.end>
0x0838e9e7 <ff_x264_deblock_v_luma_intra_sse2+39>: neg %esi
0x0838e9e9 <ff_x264_deblock_v_luma_intra_sse2+41>: dec %ebx
0x0838e9ea <ff_x264_deblock_v_luma_intra_sse2+42>: jl 0x838ed6e <ff_x264_deblock_v_luma_intra_sse2.end>
0x0838e9f0 <ff_x264_deblock_v_luma_intra_sse2+48>: add %eax,%esi
0x0838e9f2 <ff_x264_deblock_v_luma_intra_sse2+50>: movaps (%esi,%ecx,2),%xmm0
0x0838e9f6 <ff_x264_deblock_v_luma_intra_sse2+54>: movaps (%esi,%edi,1),%xmm1
0x0838e9fa <ff_x264_deblock_v_luma_intra_sse2+58>: movaps (%eax),%xmm2
0x0838e9fd <ff_x264_deblock_v_luma_intra_sse2+61>: movaps (%eax,%ecx,1),%xmm3
0x0838ea01 <ff_x264_deblock_v_luma_intra_sse2+65>: movd %edx,%xmm4
0x0838ea05 <ff_x264_deblock_v_luma_intra_sse2+69>: movd %ebx,%xmm5
0x0838ea09 <ff_x264_deblock_v_luma_intra_sse2+73>: pshuflw $0x0,%xmm4,%xmm4
0x0838ea0e <ff_x264_deblock_v_luma_intra_sse2+78>: punpcklqdq %xmm4,%xmm4
End of assembler dump.
(gdb) info all-registers
eax 0xbfe77458 -1075350440
ecx 0x10 16
edx 0x3 3
ebx 0x1 1
esp 0xbfe77398 0xbfe77398
ebp 0x89351c0 0x89351c0
esi 0xbfe77418 -1075350504
edi 0x30 48
eip 0x838e9f2 0x838e9f2 <ff_x264_deblock_v_luma_intra_sse2+50>
eflags 0x210287 [ CF PF SF IF RF ID ]
cs 0x73 115
ss 0x7b 123
ds 0x7b 123
es 0x7b 123
fs 0x0 0
gs 0x33 51
st0 -nan(0xdcb1687069778689) (raw 0xffffdcb1687069778689)
st1 -nan(0xd9d7d69cd9d6d572) (raw 0xffffd9d7d69cd9d6d572)
st2 -nan(0xd7d6ad67d6da7a6b) (raw 0xffffd7d6ad67d6da7a6b)
st3 -nan(0xda786a706e7d8589) (raw 0xffffda786a706e7d8589)
st4 -nan(0xd7d6ad6773707385) (raw 0xffffd7d6ad6773707385)
st5 -nan(0xd9d7d69c6778717a) (raw 0xffffd9d7d69c6778717a)
st6 -nan(0xd9d6d57273736a7d) (raw 0xffffd9d6d57273736a7d)
st7 -nan(0xd6da7a6b7175838b) (raw 0xffffd6da7a6b7175838b)
fctrl 0x37f 895
fstat 0x120 288
ftag 0xaaaa 43690
fiseg 0x73 115
fioff 0x80f1447 135205959
foseg 0x7b 123
fooff 0x88d58ac 143481004
fop 0x11c 284
xmm0 {v4_float = {0x0, 0x0, 0x0, 0x0}, v2_double = {
0x8000000000000000, 0x8000000000000000}, v16_int8 = {
0x7f <repeats 16 times>}, v8_int16 = {0x7f7f, 0x7f7f, 0x7f7f, 0x7f7f,
0x7f7f, 0x7f7f, 0x7f7f, 0x7f7f}, v4_int32 = {0x7f7f7f7f, 0x7f7f7f7f,
0x7f7f7f7f, 0x7f7f7f7f}, v2_int64 = {0x7f7f7f7f7f7f7f7f,
0x7f7f7f7f7f7f7f7f}, uint128 = 0x7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f}
xmm1 {v4_float = {0x0, 0x0, 0x0, 0x0}, v2_double = {
0x8000000000000000, 0x8000000000000000}, v16_int8 = {
0x7f <repeats 16 times>}, v8_int16 = {0x7f7f, 0x7f7f, 0x7f7f, 0x7f7f,
0x7f7f, 0x7f7f, 0x7f7f, 0x7f7f}, v4_int32 = {0x7f7f7f7f, 0x7f7f7f7f,
0x7f7f7f7f, 0x7f7f7f7f}, v2_int64 = {0x7f7f7f7f7f7f7f7f,
0x7f7f7f7f7f7f7f7f}, uint128 = 0x7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f}
xmm2 {v4_float = {0x0, 0x0, 0x0, 0xffffe11d}, v2_double = {0x0,
0x8000000000000000}, v16_int8 = {0xdc, 0xa8, 0x2e, 0x6, 0xa9, 0xe0,
0x81,
0x88, 0xad, 0x30, 0xc4, 0x71, 0x5b, 0x1f, 0xf7, 0xc5}, v8_int16 =
{0xa8dc,
0x62e, 0xe0a9, 0x8881, 0x30ad, 0x71c4, 0x1f5b, 0xc5f7}, v4_int32 = {
0x62ea8dc, 0x8881e0a9, 0x71c430ad, 0xc5f71f5b}, v2_int64 = {
0x8881e0a9062ea8dc, 0xc5f71f5b71c430ad},
uint128 = 0xc5f71f5b71c430ad8881e0a9062ea8dc}
xmm3 {v4_float = {0x0, 0x0, 0x2c8ea00, 0x0}, v2_double = {
0x8000000000000000, 0x8000000000000000}, v16_int8 = {0x30, 0x1d, 0x4c,
0x24, 0x7a, 0x96, 0x2a, 0xdf, 0x8b, 0x9b, 0xfe, 0xcf, 0x71, 0x84,
0x1c,
0xdb}, v8_int16 = {0x1d30, 0x244c, 0x967a, 0xdf2a, 0x9b8b, 0xcffe,
0x8471,
0xdb1c}, v4_int32 = {0x244c1d30, 0xdf2a967a, 0xcffe9b8b, 0xdb1c8471},
v2_int64 = {0xdf2a967a244c1d30, 0xdb1c8471cffe9b8b},
uint128 = 0xdb1c8471cffe9b8bdf2a967a244c1d30}
xmm4 {v4_float = {0x0, 0x0, 0x0, 0x0}, v2_double = {0x0,
0x8000000000000000}, v16_int8 = {0xc4, 0x6c, 0xff, 0x0, 0x28, 0xda,
0xc9,
0x9c, 0xf9, 0xdc, 0x57, 0x71, 0x1, 0x8, 0xbf, 0xeb}, v8_int16 =
{0x6cc4,
0xff, 0xda28, 0x9cc9, 0xdcf9, 0x7157, 0x801, 0xebbf}, v4_int32 = {
0xff6cc4, 0x9cc9da28, 0x7157dcf9, 0xebbf0801}, v2_int64 = {
0x9cc9da2800ff6cc4, 0xebbf08017157dcf9},
uint128 = 0xebbf08017157dcf99cc9da2800ff6cc4}
xmm5 {v4_float = {0x0, 0x2, 0x0, 0x0}, v2_double = {0x13, 0x0},
v16_int8 = {0x2b, 0x70, 0x1c, 0x7e, 0x2e, 0x5f, 0x33, 0x40, 0x74, 0x7a,
0x4a, 0x76, 0x94, 0x7a, 0x2f, 0x35}, v8_int16 = {0x702b, 0x7e1c,
0x5f2e,
0x4033, 0x7a74, 0x764a, 0x7a94, 0x352f}, v4_int32 = {0x7e1c702b,
0x40335f2e, 0x764a7a74, 0x352f7a94}, v2_int64 = {0x40335f2e7e1c702b,
0x352f7a94764a7a74}, uint128 = 0x352f7a94764a7a7440335f2e7e1c702b}
xmm6 {v4_float = {0x0, 0x0, 0x0, 0x0}, v2_double = {
0x8000000000000000, 0x8000000000000000}, v16_int8 = {0x0, 0xcf, 0x34,
0x17, 0x68, 0x0, 0xaf, 0xed, 0x9c, 0x84, 0x55, 0x5f, 0xc0, 0x8f, 0x7c,
0xf1}, v8_int16 = {0xcf00, 0x1734, 0x68, 0xedaf, 0x849c, 0x5f55,
0x8fc0,
0xf17c}, v4_int32 = {0x1734cf00, 0xedaf0068, 0x5f55849c, 0xf17c8fc0},
v2_int64 = {0xedaf00681734cf00, 0xf17c8fc05f55849c},
uint128 = 0xf17c8fc05f55849cedaf00681734cf00}
xmm7 {v4_float = {0x6, 0xfe240240, 0x0, 0x69b}, v2_double = {
0x8000000000000000, 0x8000000000000000}, v16_int8 = {0x34, 0xc, 0xd3,
0x40, 0xe0, 0xfe, 0xed, 0xcb, 0x1, 0x73, 0x3c, 0x67, 0x91, 0x60, 0xd3,
0x44}, v8_int16 = {0xc34, 0x40d3, 0xfee0, 0xcbed, 0x7301, 0x673c,
0x6091,
0x44d3}, v4_int32 = {0x40d30c34, 0xcbedfee0, 0x673c7301, 0x44d36091},
v2_int64 = {0xcbedfee040d30c34, 0x44d36091673c7301},
uint128 = 0x44d36091673c7301cbedfee040d30c34}
mxcsr 0x9fe0 [ PE DAZ IM DM ZM OM UM PM FZ ]
mm0 {uint64 = 0xdcb1687069778689, v2_int32 = {0x69778689,
0xdcb16870}, v4_int16 = {0x8689, 0x6977, 0x6870, 0xdcb1}, v8_int8 =
{0x89,
0x86, 0x77, 0x69, 0x70, 0x68, 0xb1, 0xdc}}
mm1 {uint64 = 0xd9d7d69cd9d6d572, v2_int32 = {0xd9d6d572,
0xd9d7d69c}, v4_int16 = {0xd572, 0xd9d6, 0xd69c, 0xd9d7}, v8_int8 =
{0x72,
0xd5, 0xd6, 0xd9, 0x9c, 0xd6, 0xd7, 0xd9}}
mm2 {uint64 = 0xd7d6ad67d6da7a6b, v2_int32 = {0xd6da7a6b,
0xd7d6ad67}, v4_int16 = {0x7a6b, 0xd6da, 0xad67, 0xd7d6}, v8_int8 =
{0x6b,
0x7a, 0xda, 0xd6, 0x67, 0xad, 0xd6, 0xd7}}
mm3 {uint64 = 0xda786a706e7d8589, v2_int32 = {0x6e7d8589,
0xda786a70}, v4_int16 = {0x8589, 0x6e7d, 0x6a70, 0xda78}, v8_int8 =
{0x89,
0x85, 0x7d, 0x6e, 0x70, 0x6a, 0x78, 0xda}}
mm4 {uint64 = 0xd7d6ad6773707385, v2_int32 = {0x73707385,
0xd7d6ad67}, v4_int16 = {0x7385, 0x7370, 0xad67, 0xd7d6}, v8_int8 =
{0x85,
0x73, 0x70, 0x73, 0x67, 0xad, 0xd6, 0xd7}}
mm5 {uint64 = 0xd9d7d69c6778717a, v2_int32 = {0x6778717a,
0xd9d7d69c}, v4_int16 = {0x717a, 0x6778, 0xd69c, 0xd9d7}, v8_int8 =
{0x7a,
0x71, 0x78, 0x67, 0x9c, 0xd6, 0xd7, 0xd9}}
mm6 {uint64 = 0xd9d6d57273736a7d, v2_int32 = {0x73736a7d,
0xd9d6d572}, v4_int16 = {0x6a7d, 0x7373, 0xd572, 0xd9d6}, v8_int8 =
{0x7d,
0x6a, 0x73, 0x73, 0x72, 0xd5, 0xd6, 0xd9}}
mm7 {uint64 = 0xd6da7a6b7175838b, v2_int32 = {0x7175838b,
0xd6da7a6b}, v4_int16 = {0x838b, 0x7175, 0x7a6b, 0xd6da}, v8_int8 =
{0x8b,
0x83, 0x75, 0x71, 0x6b, 0x7a, 0xda, 0xd6}}
--
-Mike Melanson
More information about the ffmpeg-devel
mailing list