[FFmpeg-devel] [PATCH] ARM NEON dsputil encode functions
Måns Rullgård
mans
Mon Jul 13 13:05:51 CEST 2009
Ian Rickards <ian.rickards at btinternet.com> writes:
> Hi Mans, et al.
>
> Attached is a trivial patch to improve encode performance using ARM NEON
Let's see if we can make it less trivial ;-)
> Test on Cortex-A8/Beagleboard using h264 youtube video
> Decode only : 57.7 fps
> h263p encode (current) : 17.5 fps
> h263p encode (patched) : 25.3 fps
>
> => Speed up of 74%
>
> Thanks
>
>
> diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
> index 303b11c..add8c6d 100644
> --- a/libavcodec/arm/dsputil_neon_s.S
> +++ b/libavcodec/arm/dsputil_neon_s.S
> @@ -793,3 +793,194 @@ function ff_vector_fmul_window_neon, export=1
> vst1.64 {d22,d23},[ip,:128], r5
> pop {r4,r5,pc}
> .endfunc
> +
> +function ff_pix_abs16_neon, export=1
> + ldr ip, [sp]
> + vmov.i16 q8, #0
> +1:
> + vld1.8 {d0-d1}, [r1], r3
> + subs ip, ip, #1
> + vld1.8 {d2-d3}, [r2], r3
> + pld [r1, r3, lsl #1]
> + vabal.u8 q8, d0, d2
> + pld [r2, r3, lsl #1]
> + vabal.u8 q8, d1, d3
> + bne 1b
> + vpadd.u16 d16, d16, d17
> + vpaddl.u16 d16, d16
> + vpaddl.u32 d16, d16
> + vmov.u32 r0, d16[0]
> + bx lr
> + .endfunc
Using separate accumulator registers should improve scheduling here
since vabal has quite long latency. Also, r1 is 16-byte aligned, so
the loads from there should use an alignment specifier. Unrolling a
little may also help. Most of these functions have a minimum number
of iterations listed in dsputil.h along with alignment guarantees.
Take advantage of this.
Try something like this:
@ ff_pix_abs16_neon: sum of absolute byte differences between two blocks
@ that are 16 bytes wide, processed two rows per loop pass.
@
@ In:   r1   = first block; loaded with a :128 hint, so every row address
@              must be 16-byte aligned
@       r2   = second block; loaded without an alignment hint
@       r3   = post-increment applied to r1 and r2 after each row load
@       [sp] = row count; the loop subtracts 2 per pass and exits only on
@              zero, so it must be a non-zero multiple of 2
@              NOTE(review): confirm every caller guarantees this
@ Out:  r0   = sum of the absolute differences
@ Uses: ip, flags, q0-q3, q8-q11
@ NOTE(review): presumably r0 on entry is the unused context pointer of the
@ C prototype; it is never read here -- confirm against dsputil.h
function ff_pix_abs16_neon, export=1
@ ip = row count, taken from the first stack slot
ldr ip, [sp]
@ Four independent 16-bit accumulators so that consecutive vabal
@ instructions never wait on each other's result.
vmov.i16 q8, #0
vmov.i16 q9, #0
vmov.i16 q10, #0
vmov.i16 q11, #0
1:
@ Two rows are consumed per pass; flags set here are tested by the bne below.
subs ip, ip, #2
@ Row A: q0 = 16 bytes from r1, q1 = 16 bytes from r2
vld1.8 {d0-d1}, [r1,:128], r3
vld1.8 {d2-d3}, [r2], r3
@ Prefetch ahead of the already advanced pointers, interleaved with the
@ arithmetic.
pld [r1, r3, lsl #1]
@ q8 += |d0 - d2| (low 8 bytes of row A, widened to 16 bits)
vabal.u8 q8, d0, d2
pld [r2, r3, lsl #1]
@ Row B loads are issued between the row A accumulations:
@ q2 = 16 bytes from r1, q3 = 16 bytes from r2
vld1.8 {d4-d5}, [r1,:128], r3
@ q9 += |d1 - d3| (high 8 bytes of row A)
vabal.u8 q9, d1, d3
vld1.8 {d6-d7}, [r2], r3
pld [r1, r3, lsl #1]
@ q10 += |d4 - d6| (low 8 bytes of row B)
vabal.u8 q10, d4, d6
pld [r2, r3, lsl #1]
@ q11 += |d5 - d7| (high 8 bytes of row B)
vabal.u8 q11, d5, d7
bne 1b
@ Merge the four accumulators into q8 (eight 16-bit lanes).
vadd.i16 q8, q8, q9
vadd.i16 q10, q10, q11
vadd.i16 q8, q8, q10
@ Horizontal reduction: 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64.
@ After the vpadd each 16-bit lane holds 4 * rows differences of at most
@ 255 each, which stays below 65536 while the row count is at most 64.
vpadd.i16 d16, d16, d17
vpaddl.u16 d16, d16
vpaddl.u32 d16, d16
@ Return the low 32 bits of the total.
vmov.u32 r0, d16[0]
bx lr
.endfunc
This is untested, and may not work at all...
> +function ff_pix_abs8_neon, export=1
> + ldr ip, [sp]
> + vmov.i16 q8, #0
> +1:
> + vld1.8 {d1}, [r1], r3
> + subs ip, ip, #1
> + vld1.8 {d2}, [r2], r3
> + pld [r1, r3, lsl #1]
> + vabal.u8 q8, d1, d2
> + pld [r2, r3, lsl #1]
> + bne 1b
> + vpadd.u16 d16, d16, d17
> + vpaddl.u16 d16, d16
> + vpaddl.u32 d16, d16
> + vmov.u32 r0, d16[0]
> + bx lr
> + .endfunc
Same again. Fast NEON code is all about pipelining. This loop can
never dual-issue, and will stall one cycle on the vabal instruction.
Unrolling it once will allow some dual-issue and should have no
stalls.
> +function ff_pix_abs16_x2_neon, export=1
> + ldr ip, [sp]
> + vmov.i16 q8, #0
> + add r0, r2, #16
> +1:
> + vld1.8 {d0-d1}, [r1], r3
> + subs ip, ip, #1
> + vld1.8 {d2-d3}, [r2], r3
> + vld1.8 {d4[0]}, [r0], r3
> + vext.8 q3, q1, q2, #1
> + vrhadd.u8 q15, q3, q1
> + pld [r1, r3, lsl #1]
> + vabal.u8 q8, d0, d30
> + pld [r2, r3, lsl #1]
> + vabal.u8 q8, d1, d31
> + bne 1b
> + vpadd.u16 d16, d16, d17
> + vpaddl.u16 d16, d16
> + vpaddl.u32 d16, d16
> + vmov.u32 r0, d16[0]
> + bx lr
> + .endfunc
More of the same. Remember that vld1 and vext can dual-issue with
arithmetic instructions if you let them. Unrolling this loop should
make that possible. And as usual, use different destination registers
for the vabal instructions, or they'll stall (for 3 cycles in the code
as written above).
> +function ff_pix_abs16_y2_neon, export=1
> + ldr ip, [sp]
> + vmov.i16 q8, #0
> + vld1.8 {d2-d3}, [r2], r3
> +1:
> + vld1.8 {d4-d5}, [r2], r3
> + subs ip, ip, #1
> + vld1.8 {d0-d1}, [r1], r3
> + vrhadd.u8 q15, q1, q2
> + pld [r2, r3, lsl #1]
> + vmov q1, q2
> + vabal.u8 q8, d0, d30
> + pld [r1, r3, lsl #1]
> + vabal.u8 q8, d1, d31
> + bne 1b
> + vpadd.u16 d16, d16, d17
> + vpaddl.u16 d16, d16
> + vpaddl.u32 d16, d16
> + vmov.u32 r0, d16[0]
> + bx lr
> + .endfunc
> +
> +function ff_pix_abs16_xy2_neon, export=1
> + ldr ip, [sp]
> + vmov.u16 q15, #0
> + add r0, r2, #16
> + vld1.8 {d16-d17},[r2], r3
> + vld1.8 {d18[0]}, [r0], r3
> + vext.8 q10, q8, q9, #1
> + vaddl.u8 q2, d20, d16
> + vaddl.u8 q3, d21, d17
> +1:
> + vld1.8 {d16-d17},[r2], r3
> + subs ip, ip, #1
> + vld1.8 {d18[0]}, [r0], r3
> + vld1.8 {d0-d1}, [r1], r3
> + pld [r2, r3, lsl #1]
> + vext.8 q10, q8, q9, #1
> + vaddl.u8 q12, d20, d16
> + pld [r1, r3, lsl #1]
> + vaddl.u8 q13, d21, d17
> + vadd.u16 q10, q2, q12
> + vadd.u16 q11, q3, q13
> + vrshrn.u16 d2, q10, #2
> + vrshrn.u16 d3, q11, #2
> + vmov q2, q12
> + vmov q3, q13
> + vabal.u8 q15, d0, d2
> + vabal.u8 q15, d1, d3
> + bne 1b
> + vpadd.u16 d30, d30, d31
> + vpaddl.u16 d30, d30
> + vpaddl.u32 d30, d30
> + vmov.u32 r0, d30[0]
> + bx lr
> + .endfunc
Again very little room for dual-issue. Try to interleave vld1/vext
instructions with arithmetic as much as possible.
> +function ff_diff_pixels_neon, export=1
> + mov ip, #16
> +1:
> + vld1.8 {d0}, [r1], r3
> + vld1.8 {d1}, [r1], r3
> + vld1.8 {d2}, [r2], r3
> + vld1.8 {d3}, [r2], r3
> + vsubl.u8 q10, d0, d2
> + vsubl.u8 q11, d1, d3
> + vst1.16 {d20-d23},[r0]!
> + subs ip, ip, #4
> + bne 1b
> + bx lr
> + .endfunc
A little reordering here will allow some dual-issue, and alignment
should be specified.
> +function ff_pix_sum_neon, export=1
> + mov ip, #16
> + vmov.u32 q8, #0
> +1:
> + vld1.8 {d0-d1}, [r0], r1
> + subs ip, ip, #1
> + vpadal.u8 q8, q0
> + pld [r0, r1, lsl #1]
> + bne 1b
> + vpadd.u16 d0, d16, d17
> + vpaddl.u16 d0, d0
> + vpaddl.u32 d0, d0
> + vmov.u32 r0, d0[0]
> + bx lr
> + .endfunc
Unrolling this should let it dual-issue a bit.
> +function ff_pix_norm1_neon, export=1
> + mov ip, #16
> + vmov.u32 q8, #0
> +1:
> + vld1.8 {d0-d1}, [r0], r1
> + subs ip, ip, #1
> + vmovl.u8 q2, d0
> + pld [r0, r1, lsl #1]
> + vmovl.u8 q3, d1
> + vmlal.u16 q8, d4, d4
> + vmlal.u16 q8, d5, d5
> + vmlal.u16 q8, d6, d6
> + vmlal.u16 q8, d7, d7
> + bne 1b
> + vpadd.u32 d0, d16, d17
> + vpaddl.u32 d0, d0
> + vmov.u32 r0, d0[0]
> + bx lr
> + .endfunc
Have you spotted the pattern yet?
> +function ff_sse16_neon, export=1
> + ldr ip, [sp]
> + vmov.u32 q8, #0
> +1:
> + vld1.8 {d0-d1}, [r1], r3
> + subs ip, ip, #1
> + vld1.8 {d2-d3}, [r2], r3
> + pld [r1, r3, lsl #1]
> + vsubl.u8 q2, d0, d2
> + pld [r2, r3, lsl #1]
> + vsubl.u8 q3, d1, d3
> + vmlal.s16 q8, d4, d4
> + vmlal.s16 q8, d5, d5
> + vmlal.s16 q8, d6, d6
> + vmlal.s16 q8, d7, d7
> + bne 1b
> + vpadd.u32 d0, d16, d17
> + vpaddl.u32 d1, d0
> + vmov.u32 r0, d1[0]
> + bx lr
> + .endfunc
...
--
Måns Rullgård
mans at mansr.com
More information about the ffmpeg-devel
mailing list