[FFmpeg-devel] [PATCH v2 6/9] swscale/arm/yuv2rgb: macro-ify
Benoit Fouet
benoit.fouet at free.fr
Thu Mar 31 11:36:12 CEST 2016
On 28/03/2016 21:19, Matthieu Bouron wrote:
> ---
> libswscale/arm/yuv2rgb_neon.S | 137 ++++++++++++++++++------------------------
> 1 file changed, 60 insertions(+), 77 deletions(-)
>
> diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S
> index ef7b0a6..e1b68c1 100644
> --- a/libswscale/arm/yuv2rgb_neon.S
> +++ b/libswscale/arm/yuv2rgb_neon.S
> @@ -64,7 +64,7 @@
> vmov.u8 \a2, #255
> .endm
>
> -.macro compute_16px dst y0 y1 ofmt
> +.macro compute dst y0 y1 ofmt
> vmovl.u8 q14, \y0 @ 8px of y
> vmovl.u8 q15, \y1 @ 8px of y
>
> @@ -99,23 +99,23 @@
>
> .endm
>
> -.macro process_1l_16px ofmt
> +.macro process_1l ofmt
> compute_premult d28, d29, d30, d31
> vld1.8 {q7}, [r4]!
> - compute_16px r2, d14, d15, \ofmt
> + compute r2, d14, d15, \ofmt
> .endm
>
> -.macro process_2l_16px ofmt
> +.macro process_2l ofmt
> compute_premult d28, d29, d30, d31
>
> vld1.8 {q7}, [r4]! @ first line of luma
> - compute_16px r2, d14, d15, \ofmt
> + compute r2, d14, d15, \ofmt
>
> vld1.8 {q7}, [r12]! @ second line of luma
> - compute_16px r11, d14, d15, \ofmt
> + compute r11, d14, d15, \ofmt
> .endm
>
> -.macro load_args_nvx
> +.macro load_args_nv12
> push {r4-r12, lr}
> vpush {q4-q7}
> ldr r4, [sp, #104] @ r4 = srcY
> @@ -136,6 +136,10 @@
> sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
> .endm
>
> +.macro load_args_nv21
> + load_args_nv12
> +.endm
> +
> .macro load_args_yuv420p
> push {r4-r12, lr}
> vpush {q4-q7}
> @@ -176,55 +180,23 @@
> ldr r10,[sp, #120] @ r10 = srcV
> .endm
>
> -.macro declare_func ifmt ofmt
> -function ff_\ifmt\()_to_\ofmt\()_neon, export=1
> -
> -.ifc \ifmt,nv12
> - load_args_nvx
> -.endif
> -
> -.ifc \ifmt,nv21
> - load_args_nvx
> -.endif
> -
> -.ifc \ifmt,yuv420p
> - load_args_yuv420p
> -.endif
> -
> -
> -.ifc \ifmt,yuv422p
> - load_args_yuv422p
> -.endif
> -
> -1:
> - mov r8, r0 @ r8 = width
> -2:
> - pld [r6, #64*3]
> - pld [r4, #64*3]
> -
> - vmov.i8 d10, #128
> -
> -.ifc \ifmt,nv12
> +.macro load_chroma_nv12
> pld [r12, #64*3]
>
> vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
> vsubl.u8 q14, d2, d10 @ q14 = U - 128
> vsubl.u8 q15, d3, d10 @ q15 = V - 128
> +.endm
>
> - process_2l_16px \ofmt
> -.endif
> -
> -.ifc \ifmt,nv21
> +.macro load_chroma_nv21
> pld [r12, #64*3]
>
> vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
> vsubl.u8 q14, d3, d10 @ q14 = U - 128
> vsubl.u8 q15, d2, d10 @ q15 = V - 128
> +.endm
>
> - process_2l_16px \ofmt
> -.endif
> -
> -.ifc \ifmt,yuv420p
> +.macro load_chroma_yuv420p
> pld [r10, #64*3]
> pld [r12, #64*3]
>
> @@ -232,68 +204,79 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
> vld1.8 d3, [r10]! @ d3: chroma blue line
> vsubl.u8 q14, d2, d10 @ q14 = U - 128
> vsubl.u8 q15, d3, d10 @ q15 = V - 128
> +.endm
>
> - process_2l_16px \ofmt
> -.endif
> -
> -.ifc \ifmt,yuv422p
> +.macro load_chroma_yuv422p
> pld [r10, #64*3]
>
> vld1.8 d2, [r6]! @ d2: chroma red line
> vld1.8 d3, [r10]! @ d3: chroma blue line
> vsubl.u8 q14, d2, d10 @ q14 = U - 128
> vsubl.u8 q15, d3, d10 @ q15 = V - 128
> +.endm
>
> - process_1l_16px \ofmt
> -.endif
> -
> - subs r8, r8, #16 @ width -= 16
> - bgt 2b
> -
> - add r2, r2, r3 @ dst += padding
> - add r4, r4, r5 @ srcY += paddingY
> -
> -.ifc \ifmt,nv12
> +.macro increment_nv12
> add r11, r11, r3 @ dst2 += padding
> add r12, r12, r5 @ srcY2 += paddingY
> -
> add r6, r6, r7 @ srcC += paddingC
> -
> subs r1, r1, #2 @ height -= 2
> -.endif
> -
> -.ifc \ifmt,nv21
> - add r11, r11, r3 @ dst2 += padding
> - add r12, r12, r5 @ srcY2 += paddingY
> +.endm
>
> - add r6, r6, r7 @ srcC += paddingC
> - subs r1, r1, #2 @ height -= 2
> -.endif
> +.macro increment_nv21
> + increment_nv12
> +.endm
>
> -.ifc \ifmt,yuv420p
> +.macro increment_yuv420p
> add r11, r11, r3 @ dst2 += padding
> add r12, r12, r5 @ srcY2 += paddingY
> -
> ldr r7, [sp, #116] @ r7 = linesizeU
> sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
> add r6, r6, r7 @ srcU += paddingU
> -
> ldr r7, [sp, #124] @ r7 = linesizeV
> sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
> add r10, r10, r7 @ srcV += paddingV
> -
> subs r1, r1, #2 @ height -= 2
> -.endif
> +.endm
>
> -.ifc \ifmt,yuv422p
> +.macro increment_yuv422p
> add r6, r6, r7 @ srcU += paddingU
> add r10,r10,r12 @ srcV += paddingV
> -
> subs r1, r1, #1 @ height -= 1
> -.endif
> +.endm
>
> - bgt 1b
> +.macro process_nv12 ofmt
> + process_2l \ofmt
> +.endm
> +
> +.macro process_nv21 ofmt
> + process_2l \ofmt
> +.endm
> +
> +.macro process_yuv420p ofmt
> + process_2l \ofmt
> +.endm
>
> +.macro process_yuv422p ofmt
> + process_1l \ofmt
> +.endm
> +
> +.macro declare_func ifmt ofmt
> +function ff_\ifmt\()_to_\ofmt\()_neon, export=1
> + load_args_\ifmt
> +1:
> + mov r8, r0 @ r8 = width
> +2:
> + pld [r6, #64*3]
> + pld [r4, #64*3]
> + vmov.i8 d10, #128
> + load_chroma_\ifmt
> + process_\ifmt \ofmt
> + subs r8, r8, #16 @ width -= 16
> + bgt 2b
> + add r2, r2, r3 @ dst += padding
> + add r4, r4, r5 @ srcY += paddingY
> + increment_\ifmt
> + bgt 1b
> vpop {q4-q7}
> pop {r4-r12, lr}
> mov pc, lr
More information about the ffmpeg-devel mailing list