[FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration

Fri Jul 15 22:32:11 EEST 2022

If the max height is just 16, then this should be fine. I assumed that h could have a much higher value (>1024), but if that is not the case, then this is a useful optimization.

Thanks!

-- 

Jonathan Swinney

On 7/13/22, 3:49 PM, "Martin Storsjö" <martin at martin.st> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.

    The max height is 16 and the max absolute difference per pixel is
    255, so each .8h lane accumulates at most 16*255 = 4080, far below
    the 16-bit limit of 65535. Thus keep accumulating in two .8h
    vectors, and just do the final accumulation at the end.

    This requires a minor register renumbering in ff_pix_abs16_xy2_neon.

    Before:       Cortex A53    A72    A73   Graviton 3
    pix_abs_0_0_neon:   97.7   47.0   37.5   22.7
    pix_abs_0_1_neon:  154.0   59.0   52.0   25.0
    pix_abs_0_3_neon:  179.7   96.7   87.5   41.2
    After:
    pix_abs_0_0_neon:   96.0   39.2   31.2   22.0
    pix_abs_0_1_neon:  150.7   59.7   46.2   23.7
    pix_abs_0_3_neon:  175.7   83.7   81.7   38.2

    ---
    I remember suggesting this before, and having it dismissed for
    some reason I don't remember - maybe that the element size wasn't
    big enough to hold the intermediate results? At least as far as I
    can see, it can hold the results just fine, and this passes all
    tests.
    ---
     libavcodec/aarch64/me_cmp_neon.S | 102 +++++++++++++++----------------
     1 file changed, 49 insertions(+), 53 deletions(-)

    diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
    index 89546869fb..cda7ce0408 100644
    --- a/libavcodec/aarch64/me_cmp_neon.S
    +++ b/libavcodec/aarch64/me_cmp_neon.S
    @@ -27,15 +27,16 @@ function ff_pix_abs16_neon, export=1
             // x3           ptrdiff_t stride
             // w4           int h
             cmp             w4, #4                      // if h < 4, jump to completion section
    -        movi            v18.4S, #0                  // clear result accumulator
    +        movi            v16.8h, #0                  // clear result accumulator
    +        movi            v17.8h, #0                  // clear result accumulator
             b.lt            2f
     1:
             ld1             {v0.16b}, [x1], x3          // load pix1
             ld1             {v4.16b}, [x2], x3          // load pix2
             ld1             {v1.16b}, [x1], x3          // load pix1
             ld1             {v5.16b}, [x2], x3          // load pix2
    -        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    -        uabdl2          v17.8h, v0.16b, v4.16b
    +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    +        uabal2          v17.8h, v0.16b, v4.16b
             ld1             {v2.16b}, [x1], x3          // load pix1
             ld1             {v6.16b}, [x2], x3          // load pix2
             uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
    @@ -48,27 +49,26 @@ function ff_pix_abs16_neon, export=1
             uabal           v16.8h, v3.8b, v7.8b
             uabal2          v17.8h, v3.16b, v7.16b
             cmp             w4, #4                      // if h >= 4, loop
    -        add             v16.8h, v16.8h, v17.8h
    -        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    -        add             d18, d16, d18               // add to the end result register

             b.ge            1b
             cbnz            w4, 2f                      // if iterations remain, jump to completion section

    -        fmov            w0, s18                     // copy result to general purpose register
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    +        fmov            w0, s16                     // copy result to general purpose register
             ret

     2:
             ld1             {v0.16b}, [x1], x3          // load pix1
             ld1             {v4.16b}, [x2], x3          // load pix2
    -        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    -        uabal2          v16.8h, v0.16b, v4.16b
             subs            w4, w4, #1                  // h -= 1
    -        addv            h16, v16.8h                 // add up v16
    -        add             d18, d16, d18               // add to result
    +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    +        uabal2          v17.8h, v0.16b, v4.16b
             b.ne            2b

    -        fmov            w0, s18                     // copy result to general purpose register
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    +        fmov            w0, s16                     // copy result to general purpose register
             ret
     endfunc

    @@ -80,7 +80,8 @@ function ff_pix_abs16_xy2_neon, export=1
             // w4           int h

             add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
    -        movi            v0.2d, #0                   // initialize the result register
    +        movi            v21.8h, #0                  // initialize the result register
    +        movi            v22.8h, #0                  // initialize the result register

             // Load initial pix2 values for either the unrolled version or completion version.
             ldur            q4, [x2, #1]                // load pix2+1
    @@ -119,15 +120,15 @@ function ff_pix_abs16_xy2_neon, export=1
             uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
             uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15

    -        ldur            q22, [x5, #1]               // load pix3+1
    +        ldur            q7, [x5, #1]                // load pix3+1

             add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
             add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above

    -        uabdl           v24.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
    -        uabdl2          v23.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0
    +        uabal           v21.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
    +        uabal2          v22.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0

    -        ld1             {v21.16b}, [x5], x3         // load pix3
    +        ld1             {v6.16b}, [x5], x3          // load pix3
             ld1             {v20.16b}, [x1], x3         // load pix1

             rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
    @@ -140,33 +141,33 @@ function ff_pix_abs16_xy2_neon, export=1
             rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
             rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15

    -        uabal           v24.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
    -        uabal2          v23.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1
    +        uabal           v21.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
    +        uabal2          v22.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1

    -        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
    -        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
    +        uaddl           v2.8h, v6.8b,  v7.8b        // pix3 + pix3+1 0..7
    +        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
             add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
             add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
             rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
             rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15

    -        uabal           v24.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
    -        uabal2          v23.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2
    -
             sub             w4, w4, #4                  // h -= 4

    -        uabal           v24.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
    -        uabal2          v23.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3
    +        uabal           v21.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
    +        uabal2          v22.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2

             cmp             w4, #4                      // loop if h >= 4

    -        add             v4.8h, v23.8h, v24.8h
    -        uaddlv          s4, v4.8h                   // finish adding up accumulated values
    -        add             d0, d0, d4                  // add the value to the top level accumulator
    +
    +        uabal           v21.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
    +        uabal2          v22.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3

             b.ge            1b
             cbnz            w4, 2f                      // if iterations remain jump to completion section

    +        add             v4.8h, v21.8h, v22.8h
    +        uaddlv          s0, v4.8h                   // finish adding up accumulated values
    +
             fmov            w0, s0                      // copy result to general purpose register
             ret
     2:
    @@ -182,20 +183,18 @@ function ff_pix_abs16_xy2_neon, export=1
             add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
             add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
             // divide by 4 to compute the average of values summed above
    -        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
    -        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
    -
    -        uxtl2           v7.8h, v1.16b               // 8->16 bits pix1 8..15
    -        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
    +        rshrn           v16.8b,  v16.8h, #2         // shift right by 2 0..7 (rounding shift right)
    +        rshrn2          v16.16b, v17.8h, #2         // shift right by 2 8..15

    -        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
    -        uaba            v6.8h, v7.8h, v17.8h        // absolute difference accumulate 8..15
    +        uabal           v21.8h, v1.8b,  v16.8b      // absolute difference 0..7
    +        uabal2          v22.8h, v1.16b, v16.16b     // absolute difference accumulate 8..15
             mov             v2.16b, v18.16b             // pix3 -> pix2
             mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
    -        uaddlv          s6, v6.8h                   // add up accumulator in v6
    -        add             d0, d0, d6                  // add to the final result

             b.ne            2b                          // loop if h > 0
    +
    +        add             v4.8h, v21.8h, v22.8h
    +        uaddlv          s0, v4.8h                   // finish adding up accumulated values
             fmov            w0, s0                      // copy result to general purpose register
             ret
     endfunc
    @@ -209,7 +208,8 @@ function ff_pix_abs16_x2_neon, export=1

             cmp             w4, #4
             // initialize buffers
    -        movi            d20, #0
    +        movi            v16.8h, #0
    +        movi            v17.8h, #0
             add             x5, x2, #1 // pix2 + 1
             b.lt            2f

    @@ -224,9 +224,9 @@ function ff_pix_abs16_x2_neon, export=1
             ld1             {v2.16b}, [x5], x3
             urhadd          v30.16b, v1.16b, v2.16b
             ld1             {v0.16b}, [x1], x3
    -        uabdl           v16.8h, v0.8b, v30.8b
    +        uabal           v16.8h, v0.8b, v30.8b
             ld1             {v4.16b}, [x2], x3
    -        uabdl2          v17.8h, v0.16b, v30.16b
    +        uabal2          v17.8h, v0.16b, v30.16b
             ld1             {v5.16b}, [x5], x3
             urhadd          v29.16b, v4.16b, v5.16b
             ld1             {v3.16b}, [x1], x3
    @@ -238,20 +238,15 @@ function ff_pix_abs16_x2_neon, export=1
             ld1             {v6.16b}, [x1], x3
             uabal           v16.8h, v6.8b, v28.8b
             ld1             {v24.16b}, [x2], x3
    +        sub             w4, w4, #4
             uabal2          v17.8h, v6.16b, v28.16b
             ld1             {v25.16b}, [x5], x3
             urhadd          v27.16b, v24.16b, v25.16b
             ld1             {v23.16b}, [x1], x3
    +        cmp             w4, #4
             uabal           v16.8h, v23.8b, v27.8b
             uabal2          v17.8h, v23.16b, v27.16b

    -        sub             w4, w4, #4
    -
    -        add             v16.8h, v16.8h, v17.8h
    -        uaddlv          s16, v16.8h
    -        cmp             w4, #4
    -        add             d20, d20, d16
    -
             b.ge            1b
             cbz             w4, 3f

    @@ -259,18 +254,19 @@ function ff_pix_abs16_x2_neon, export=1
     2:
             ld1             {v1.16b}, [x2], x3
             ld1             {v2.16b}, [x5], x3
    +        subs            w4, w4, #1
             urhadd          v29.16b, v1.16b, v2.16b
             ld1             {v0.16b}, [x1], x3
    -        uabd            v28.16b, v0.16b, v29.16b
    +        uabal           v16.8h, v0.8b,  v29.8b
    +        uabal2          v17.8h, v0.16b, v29.16b

    -        uaddlv          h28, v28.16b
    -        subs            w4, w4, #1

    -        add             d20, d20, d28
             b.ne            2b

     3:
    -        fmov            w0, s20
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h
    +        fmov            w0, s16

             ret
     endfunc
    --
    2.25.1