[FFmpeg-devel] [aarch64] improve performance of ff_hscale_8_to_15_neon
Clément Bœsch
u at pkh.me
Sun Dec 1 13:10:31 EET 2019
On Wed, Nov 27, 2019 at 12:30:35PM -0600, Sebastian Pop wrote:
[...]
> From 9ecaa99fab4b8bedf3884344774162636eaa5389 Mon Sep 17 00:00:00 2001
> From: Sebastian Pop <spop at amazon.com>
> Date: Sun, 17 Nov 2019 14:13:13 -0600
> Subject: [PATCH] [aarch64] use FMA and increase vector factor to 4
>
> This patch implements ff_hscale_8_to_15_neon with NEON fused multiply-accumulate
> and bumps the vectorization factor from 2 to 4.
> The speedup is 34% on Graviton A1 instances based on Cortex-A72 CPUs:
>
> $ ffmpeg -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -
> before: t:0.040303 avg:0.040287 max:0.040371 min:0.039214
> after: t:0.030079 avg:0.030102 max:0.030462 min:0.030051
>
> Tested with `make check` on aarch64-linux.
> ---
> libswscale/aarch64/hscale.S | 102 ++++++++++++++++++++++++------------
> 1 file changed, 68 insertions(+), 34 deletions(-)
>
> diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
> index cc78c1901d..12ee7f09e7 100644
> --- a/libswscale/aarch64/hscale.S
> +++ b/libswscale/aarch64/hscale.S
> @@ -21,39 +21,73 @@
> #include "libavutil/aarch64/asm.S"
>
> function ff_hscale_8_to_15_neon, export=1
> - add x10, x4, w6, UXTW #1 // filter2 = filter + filterSize*2 (x2 because int16)
> -1: ldr w8, [x5], #4 // filterPos[0]
> - ldr w9, [x5], #4 // filterPos[1]
> - movi v4.4S, #0 // val sum part 1 (for dst[0])
> - movi v5.4S, #0 // val sum part 2 (for dst[1])
> - mov w7, w6 // filterSize counter
> - mov x13, x3 // srcp = src
> -2: add x11, x13, w8, UXTW // srcp + filterPos[0]
> - add x12, x13, w9, UXTW // srcp + filterPos[1]
> - ld1 {v0.8B}, [x11] // srcp[filterPos[0] + {0..7}]
> - ld1 {v1.8B}, [x12] // srcp[filterPos[1] + {0..7}]
> - ld1 {v2.8H}, [x4], #16 // load 8x16-bit filter values, part 1
> - ld1 {v3.8H}, [x10], #16 // ditto at filter+filterSize for part 2
> - uxtl v0.8H, v0.8B // unpack part 1 to 16-bit
> - uxtl v1.8H, v1.8B // unpack part 2 to 16-bit
> - smull v16.4S, v0.4H, v2.4H // v16.i32{0..3} = part 1 of: srcp[filterPos[0] + {0..7}] * filter[{0..7}]
> - smull v18.4S, v1.4H, v3.4H // v18.i32{0..3} = part 1 of: srcp[filterPos[1] + {0..7}] * filter[{0..7}]
> - smull2 v17.4S, v0.8H, v2.8H // v17.i32{0..3} = part 2 of: srcp[filterPos[0] + {0..7}] * filter[{0..7}]
> - smull2 v19.4S, v1.8H, v3.8H // v19.i32{0..3} = part 2 of: srcp[filterPos[1] + {0..7}] * filter[{0..7}]
> - addp v16.4S, v16.4S, v17.4S // horizontal pair adding of the 8x32-bit multiplied values for part 1 into 4x32-bit
> - addp v18.4S, v18.4S, v19.4S // horizontal pair adding of the 8x32-bit multiplied values for part 2 into 4x32-bit
> - add v4.4S, v4.4S, v16.4S // update val accumulator for part 1
> - add v5.4S, v5.4S, v18.4S // update val accumulator for part 2
> - add x13, x13, #8 // srcp += 8
> - subs w7, w7, #8 // processed 8/filterSize
> - b.gt 2b // inner loop if filterSize not consumed completely
> - mov x4, x10 // filter = filter2
> - add x10, x10, w6, UXTW #1 // filter2 += filterSize*2
> - addp v4.4S, v4.4S, v5.4S // horizontal pair adding of the 8x32-bit sums into 4x32-bit
> - addp v4.4S, v4.4S, v4.4S // horizontal pair adding of the 4x32-bit sums into 2x32-bit
> - sqshrn v4.4H, v4.4S, #7 // shift and clip the 2x16-bit final values
> - st1 {v4.S}[0], [x1], #4 // write to destination
> - subs w2, w2, #2 // dstW -= 2
> - b.gt 1b // loop until end of line
> + sxtw x9, w6
> + sbfiz x12, x6, #1, #32
> + add x14, x12, x9
> + mov x8, xzr
> + sxtw x10, w2
> + sbfiz x11, x6, #3, #32
> + sbfiz x13, x6, #2, #32
> + lsl x14, x14, #1
Did you get this weird dance from the compiler? I think comments would be
welcome here to document the different filterSize * N you're computing.
Also, computing the source pointers directly instead of just the offsets
would simplify things later on; see how the original code did it with the
add.
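For instance, something along these lines (untested sketch, keeping your
register allocation) would make the intent obvious:

        sxtw    x9,  w6         // filterSize (number of int16 coefficients)
        lsl     x12, x9, #1     // filterSize*2: byte offset of filter + 1*filterSize
        lsl     x13, x9, #2     // filterSize*4: byte offset of filter + 2*filterSize
        add     x14, x13, x12   // filterSize*6: byte offset of filter + 3*filterSize
        lsl     x11, x9, #3     // filterSize*8: filter advance for 4 output pixels

(sbfiz xN, x6, #S, #32 is just sxtw + lsl #S here.)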
> +1: lsl x17, x8, #2
Can't you increment x8 by the appropriate amount at the end of the loop to
save this shift?
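Or you could drop the offset bookkeeping for these loads entirely and
consume filterPos like the original code did; untested sketch, assuming
x5 can be advanced destructively as before:

        ldpsw   x18, x0, [x5], #8   // filterPos[0], filterPos[1]
        ldpsw   x2,  x6, [x5], #8   // filterPos[2], filterPos[3]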
> + ldrsw x18, [x5, x17] // filterPos[0]
> + orr x0, x17, #0x4
> + orr x2, x17, #0x8
> + orr x17, x17, #0xc
> + ldrsw x0, [x5, x0] // filterPos[1]
> + ldrsw x2, [x5, x2] // filterPos[2]
> + ldrsw x6, [x5, x17] // filterPos[3]
> + mov x15, xzr // j = 0
> + mov x16, x4 // filter0 = filter
> + movi v0.2d, #0 // val sum part 1 (for dst[0])
> + movi v2.2d, #0 // val sum part 2 (for dst[1])
> + movi v1.2d, #0 // val sum part 3 (for dst[2])
> + movi v3.2d, #0 // val sum part 4 (for dst[3])
Please use the capitalized forms (8B, 4H, 2D, etc.) for consistency with
the original code. It may help the diff in some cases.
> + add x17, x3, x18 // srcp + filterPos[0]
> + add x18, x3, x0 // srcp + filterPos[1]
> + add x0, x3, x2 // srcp + filterPos[2]
> + add x2, x3, x6 // srcp + filterPos[3]
> +2: ldr d4, [x17, x15] // srcp[filterPos[0] + {0..7}]
> + ldr q5, [x16] // load 8x16-bit filter values, part 1
> + ldr d6, [x18, x15] // srcp[filterPos[1] + {0..7}]
> + ldr q7, [x16, x12] // load 8x16-bit at filter+filterSize
Why not use ld1 {v4.8B} etc. like it was before? The use of Dn/Qn is
very confusing here.
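Note that ld1 has no base+register addressing mode, so this means
materializing the pointer first, e.g. (x7 picked here as a hypothetical
free scratch):

        add     x7, x17, x15        // &srcp[filterPos[0] + j]
        ld1     {v4.8B}, [x7]       // srcp[filterPos[0] + {0..7}]

One extra add per load, but the vector arrangement stays explicit and
consistent with the rest of the file.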
> + ushll v4.8h, v4.8b, #0 // unpack part 1 to 16-bit
> + smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
> + smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
> + ldr d4, [x0, x15] // srcp[filterPos[2] + {0..7}]
> + ldr q5, [x16, x13] // load 8x16-bit at filter+2*filterSize
ditto ld1 {vN...}
> + ushll v6.8h, v6.8b, #0 // unpack part 2 to 16-bit
> + smlal v2.4s, v6.4h, v7.4h // v2 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
> + ushll v4.8h, v4.8b, #0 // unpack part 3 to 16-bit
> + smlal v1.4s, v4.4h, v5.4h // v1 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
> + smlal2 v1.4s, v4.8h, v5.8h // v1 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
> + ldr d4, [x2, x15] // srcp[filterPos[3] + {0..7}]
> + ldr q5, [x16, x14] // load 8x16-bit at filter+3*filterSize
ditto ld1 {vN...}
> + add x15, x15, #8 // j += 8
> + smlal2 v2.4s, v6.8h, v7.8h // v2 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
> + ushll v4.8h, v4.8b, #0 // unpack part 4 to 16-bit
> + smlal v3.4s, v4.4h, v5.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
> + cmp x15, x9 // j < filterSize
You can save this comparison if you use a decreasing counter with a subS
instruction (look at the original code).
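Untested sketch following the original's pattern (reusing w7 like the
original code did; j is still needed for the load offsets unless you
switch to post-incremented pointers):

        // before the inner loop:
        mov     w7, w6              // filterSize counter
        ...
        // at the bottom of the inner loop:
        subs    w7, w7, #8          // consumed 8 more coefficients, sets flags
        b.gt    2b                  // inner loop while filterSize remains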
> + smlal2 v3.4s, v4.8h, v5.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
> + add x16, x16, #16 // filter0 += 16
> + b.lt 2b // inner loop if filterSize not consumed completely
> + addp v0.4s, v0.4s, v0.4s // part1 horizontal pair adding
> + addp v2.4s, v2.4s, v2.4s // part2 horizontal pair adding
> + addp v1.4s, v1.4s, v1.4s // part3 horizontal pair adding
> + addp v3.4s, v3.4s, v3.4s // part4 horizontal pair adding
> + addp v0.4s, v0.4s, v0.4s // part1 horizontal pair adding
> + addp v2.4s, v2.4s, v2.4s // part2 horizontal pair adding
> + addp v1.4s, v1.4s, v1.4s // part3 horizontal pair adding
> + addp v3.4s, v3.4s, v3.4s // part4 horizontal pair adding
> + zip1 v0.4s, v0.4s, v2.4s // part12 = zip values from part1 and part2
> + zip1 v1.4s, v1.4s, v3.4s // part34 = zip values from part3 and part4
> + lsl x15, x8, #1
> + add x8, x8, #4 // i += 4
> + mov v0.d[1], v1.d[0] // part1234 = zip values from part12 and part34
> + cmp x8, x10 // i < dstW
ditto subs
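And if you also store the result with a post-increment, x8 and x15 go
away completely. Untested sketch, assuming dstW lives in a register the
loop body does not clobber (the patch currently reuses x2):

        str     d0, [x1], #8        // write 4 output pixels, dst += 8 bytes
        subs    w10, w10, #4        // dstW -= 4, sets flags
        b.gt    1b                  // loop until end of line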
> + sqshrn v0.4h, v0.4s, #7 // shift and clip the 4x16-bit final values
> + add x4, x4, x11 // filter += filterSize*4
> + str d0, [x1, x15] // write to destination part1234
> + b.lt 1b // loop until end of line
> ret
--
Clément B.