[FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
Swinney, Jonathan
jswinney at amazon.com
Fri Jul 15 22:32:11 EEST 2022
If the max height is just 16, then this should be fine. I assumed that h could have a much higher value (>1024), but if that is not the case, then this is a useful optimization.
Thanks!
--
Jonathan Swinney
On 7/13/22, 3:49 PM, "Martin Storsjö" <martin at martin.st> wrote:
The max height is 16 and the max difference per pixel is 255, so a
16 bit element (.8h) can easily contain 16*255 = 4080; thus keep
accumulating in two .8h vectors, and only do the final horizontal
accumulation at the end.
This requires a minor register renumbering in ff_pix_abs16_xy2_neon.
Before:             Cortex A53     A72     A73   Graviton 3
pix_abs_0_0_neon:         97.7    47.0    37.5         22.7
pix_abs_0_1_neon:        154.0    59.0    52.0         25.0
pix_abs_0_3_neon:        179.7    96.7    87.5         41.2
After:
pix_abs_0_0_neon:         96.0    39.2    31.2         22.0
pix_abs_0_1_neon:        150.7    59.7    46.2         23.7
pix_abs_0_3_neon:        175.7    83.7    81.7         38.2
---
I remember suggesting this before, and having it dismissed for
some reason I don't remember - maybe that the element size wasn't
big enough to hold the intermediate results? At least as far as I
can see, it can hold the results just fine, and this passes all
tests.
---
libavcodec/aarch64/me_cmp_neon.S | 102 +++++++++++++++----------------
1 file changed, 49 insertions(+), 53 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 89546869fb..cda7ce0408 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -27,15 +27,16 @@ function ff_pix_abs16_neon, export=1
// x3 ptrdiff_t stride
// w4 int h
cmp w4, #4 // if h < 4, jump to completion section
- movi v18.4S, #0 // clear result accumulator
+ movi v16.8h, #0 // clear result accumulator
+ movi v17.8h, #0 // clear result accumulator
b.lt 2f
1:
ld1 {v0.16b}, [x1], x3 // load pix1
ld1 {v4.16b}, [x2], x3 // load pix2
ld1 {v1.16b}, [x1], x3 // load pix1
ld1 {v5.16b}, [x2], x3 // load pix2
- uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate
- uabdl2 v17.8h, v0.16b, v4.16b
+ uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate
+ uabal2 v17.8h, v0.16b, v4.16b
ld1 {v2.16b}, [x1], x3 // load pix1
ld1 {v6.16b}, [x2], x3 // load pix2
uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate
@@ -48,27 +49,26 @@ function ff_pix_abs16_neon, export=1
uabal v16.8h, v3.8b, v7.8b
uabal2 v17.8h, v3.16b, v7.16b
cmp w4, #4 // if h >= 4, loop
- add v16.8h, v16.8h, v17.8h
- uaddlv s16, v16.8h // add up everything in v16 accumulator
- add d18, d16, d18 // add to the end result register
b.ge 1b
cbnz w4, 2f // if iterations remain, jump to completion section
- fmov w0, s18 // copy result to general purpose register
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h // add up everything in v16 accumulator
+ fmov w0, s16 // copy result to general purpose register
ret
2:
ld1 {v0.16b}, [x1], x3 // load pix1
ld1 {v4.16b}, [x2], x3 // load pix2
- uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate
- uabal2 v16.8h, v0.16b, v4.16b
subs w4, w4, #1 // h -= 1
- addv h16, v16.8h // add up v16
- add d18, d16, d18 // add to result
+ uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate
+ uabal2 v17.8h, v0.16b, v4.16b
b.ne 2b
- fmov w0, s18 // copy result to general purpose register
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h // add up everything in v16 accumulator
+ fmov w0, s16 // copy result to general purpose register
ret
endfunc
@@ -80,7 +80,8 @@ function ff_pix_abs16_xy2_neon, export=1
// w4 int h
add x5, x2, x3 // use x5 to hold uint8_t *pix3
- movi v0.2d, #0 // initialize the result register
+ movi v21.8h, #0 // initialize the result register
+ movi v22.8h, #0 // initialize the result register
// Load initial pix2 values for either the unrolled version or completion version.
ldur q4, [x2, #1] // load pix2+1
@@ -119,15 +120,15 @@ function ff_pix_abs16_xy2_neon, export=1
uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
- ldur q22, [x5, #1] // load pix3+1
+ ldur q7, [x5, #1] // load pix3+1
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
- uabdl v24.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
- uabdl2 v23.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
+ uabal v21.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
+ uabal2 v22.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
- ld1 {v21.16b}, [x5], x3 // load pix3
+ ld1 {v6.16b}, [x5], x3 // load pix3
ld1 {v20.16b}, [x1], x3 // load pix1
rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
@@ -140,33 +141,33 @@ function ff_pix_abs16_xy2_neon, export=1
rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
- uabal v24.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
- uabal2 v23.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
+ uabal v21.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
+ uabal2 v22.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
- uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7
- uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15
+ uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
+ uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
- uabal v24.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
- uabal2 v23.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
-
sub w4, w4, #4 // h -= 4
- uabal v24.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
- uabal2 v23.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
+ uabal v21.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
+ uabal2 v22.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
cmp w4, #4 // loop if h >= 4
- add v4.8h, v23.8h, v24.8h
- uaddlv s4, v4.8h // finish adding up accumulated values
- add d0, d0, d4 // add the value to the top level accumulator
+
+ uabal v21.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
+ uabal2 v22.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
b.ge 1b
cbnz w4, 2f // if iterations remain jump to completion section
+ add v4.8h, v21.8h, v22.8h
+ uaddlv s0, v4.8h // finish adding up accumulated values
+
fmov w0, s0 // copy result to general purpose register
ret
2:
@@ -182,20 +183,18 @@ function ff_pix_abs16_xy2_neon, export=1
add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
// divide by 4 to compute the average of values summed above
- urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
- urshr v17.8h, v17.8h, #2 // shift right by 2 8..15
-
- uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15
- uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
+ rshrn v16.8b, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
+ rshrn2 v16.16b, v17.8h, #2 // shift right by 2 8..15
- uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7
- uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15
+ uabal v21.8h, v1.8b, v16.8b // absolute difference 0..7
+ uabal2 v22.8h, v1.16b, v16.16b // absolute difference accumulate 8..15
mov v2.16b, v18.16b // pix3 -> pix2
mov v3.16b, v19.16b // pix3+1 -> pix2+1
- uaddlv s6, v6.8h // add up accumulator in v6
- add d0, d0, d6 // add to the final result
b.ne 2b // loop if h > 0
+
+ add v4.8h, v21.8h, v22.8h
+ uaddlv s0, v4.8h // finish adding up accumulated values
fmov w0, s0 // copy result to general purpose register
ret
endfunc
@@ -209,7 +208,8 @@ function ff_pix_abs16_x2_neon, export=1
cmp w4, #4
// initialize buffers
- movi d20, #0
+ movi v16.8h, #0
+ movi v17.8h, #0
add x5, x2, #1 // pix2 + 1
b.lt 2f
@@ -224,9 +224,9 @@ function ff_pix_abs16_x2_neon, export=1
ld1 {v2.16b}, [x5], x3
urhadd v30.16b, v1.16b, v2.16b
ld1 {v0.16b}, [x1], x3
- uabdl v16.8h, v0.8b, v30.8b
+ uabal v16.8h, v0.8b, v30.8b
ld1 {v4.16b}, [x2], x3
- uabdl2 v17.8h, v0.16b, v30.16b
+ uabal2 v17.8h, v0.16b, v30.16b
ld1 {v5.16b}, [x5], x3
urhadd v29.16b, v4.16b, v5.16b
ld1 {v3.16b}, [x1], x3
@@ -238,20 +238,15 @@ function ff_pix_abs16_x2_neon, export=1
ld1 {v6.16b}, [x1], x3
uabal v16.8h, v6.8b, v28.8b
ld1 {v24.16b}, [x2], x3
+ sub w4, w4, #4
uabal2 v17.8h, v6.16b, v28.16b
ld1 {v25.16b}, [x5], x3
urhadd v27.16b, v24.16b, v25.16b
ld1 {v23.16b}, [x1], x3
+ cmp w4, #4
uabal v16.8h, v23.8b, v27.8b
uabal2 v17.8h, v23.16b, v27.16b
- sub w4, w4, #4
-
- add v16.8h, v16.8h, v17.8h
- uaddlv s16, v16.8h
- cmp w4, #4
- add d20, d20, d16
-
b.ge 1b
cbz w4, 3f
@@ -259,18 +254,19 @@ function ff_pix_abs16_x2_neon, export=1
2:
ld1 {v1.16b}, [x2], x3
ld1 {v2.16b}, [x5], x3
+ subs w4, w4, #1
urhadd v29.16b, v1.16b, v2.16b
ld1 {v0.16b}, [x1], x3
- uabd v28.16b, v0.16b, v29.16b
+ uabal v16.8h, v0.8b, v29.8b
+ uabal2 v17.8h, v0.16b, v29.16b
- uaddlv h28, v28.16b
- subs w4, w4, #1
- add d20, d20, d28
b.ne 2b
3:
- fmov w0, s20
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h
+ fmov w0, s16
ret
endfunc
--
2.25.1