[FFmpeg-devel] [PATCH] h264: integrate clear_blocks calls with IDCT.
Ronald S. Bultje
rsbultje at gmail.com
Mon Feb 18 04:47:22 CET 2013
Hi,
On Sun, Feb 17, 2013 at 6:04 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Sun, Feb 17, 2013 at 02:52:54PM -0800, Ronald S. Bultje wrote:
>> From: "Ronald S. Bultje" <rsbultje at gmail.com>
>>
>> The non-intra-pcm branch in hl_decode_mb (simple, 8bpp) goes from 700
>> to 672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb
>> (in the decode_slice loop) goes from 1759 to 1733 cycles on the clip
>> tested (cathedral), i.e. almost 30 cycles per mb faster.
>>
>
>> Arm assembly changes untested.
>
> fate-h264 (h264-conformance-ba_mw_d in this case, but it's not the only
> one)
>
> Program received signal SIGSEGV, Segmentation fault.
> ff_h264_idct_add8_neon () at ffmpeg/libavcodec/arm/h264idct_neon.S:166
> 166 ldrsh r8, [r1]
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index a2521b7..8d85227 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -105,6 +105,7 @@ function ff_h264_idct_add16_neon, export=1
ldr r0, [r5], #4
ldrb r8, [r6, r8]
subs r8, r8, #1
+ mov r3, r1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
@@ -116,7 +117,7 @@ function ff_h264_idct_add16_neon, export=1
adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
blx lr
2: subs ip, ip, #1
- add r1, r1, #32
+ add r1, r3, #32
bne 1b
pop {r4-r8,pc}
endfunc
@@ -136,13 +137,14 @@ function ff_h264_idct_add16intra_neon, export=1
add r0, r0, r4
cmp r8, #0
ldrsh r8, [r1]
+ mov r3, r1
iteet ne
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
subs ip, ip, #1
- add r1, r1, #32
+ add r1, r3, #32
bne 1b
pop {r4-r8,pc}
endfunc
Does the patch above fix it?
Ronald
More information about the ffmpeg-devel mailing list