[FFmpeg-devel] [PATCH 1/4] lavc/aarch64: add hevc sao edge 16x16

Tue Oct 19 11:30:29 EEST 2021

On Thu, 7 Oct 2021, J. Dekker wrote:

> --bench on AWS Graviton:
>
> hevc_sao_edge_16x16_8_c: 1857.0
> hevc_sao_edge_16x16_8_neon: 211.0
> hevc_sao_edge_32x32_8_c: 7802.2
> hevc_sao_edge_32x32_8_neon: 808.2
> hevc_sao_edge_48x48_8_c: 16764.2
> hevc_sao_edge_48x48_8_neon: 1796.5
> hevc_sao_edge_64x64_8_c: 32647.5
> hevc_sao_edge_64x64_8_neon: 3118.5
>
> Signed-off-by: J. Dekker <jdek at itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  8 ++-
> libavcodec/aarch64/hevcdsp_sao_neon.S     | 66 +++++++++++++++++++++++
> 2 files changed, 72 insertions(+), 2 deletions(-)
>
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index c785e46f79..747ff0412d 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
>                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
>                                   int16_t *sao_offset_val, int sao_left_class,
>                                   int width, int height);
> -
> -
> +void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
> +                                          int16_t *sao_offset_val, int eo, int width, int height);
>
> av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> {
> @@ -76,6 +76,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
>         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
>         c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
> +        c->sao_edge_filter[1]          =
> +        c->sao_edge_filter[2]          =
> +        c->sao_edge_filter[3]          =
> +        c->sao_edge_filter[4]          = ff_hevc_sao_edge_filter_16x16_8_neon;
>     }
>     if (bit_depth == 10) {
>         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
> index f9fed8345b..a7f054c075 100644
> --- a/libavcodec/aarch64/hevcdsp_sao_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
> @@ -85,3 +85,69 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
>         bne             1b
>         ret
> endfunc
> +
> +// ASSUMES STRIDE_SRC = 192
> +.Lsao_edge_pos:
> +.word 1 // horizontal
> +.word 192 // vertical
> +.word 192 + 1 // 45 degree
> +.word 192 - 1 // 135 degree
> +
> +// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst,
> +//                                      int16 *sao_offset_val, int eo, int width, int height)
> +function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
> +       lsl             w4, w4, #2
> +       adr             x7, .Lsao_edge_pos
> +       ld1             {v3.8h}, [x3]              // load sao_offset_val
> +       sxtw            x5, w5
> +       ldr             w4, [x7, x4]               // stride_src

You can fold the lsl into the ldr here, as "ldr w4, [x7, w4, uxtw #2]".

> +       mov             v3.h[7], v3.h[0]           // reorder to [1,2,0,3,4]
> +       mov             v3.h[0], v3.h[1]
> +       mov             v3.h[1], v3.h[2]
> +       mov             v3.h[2], v3.h[7]

This bit is going to be surprisingly slow, I think. If possible, it'd be 
nice if the sao_offset_val array could be in the right order from the get 
go.

Other alternatives to this would be a tbl permutation, or a combination of 
ext + some masking, but I don't think it'd be significantly faster either. 
I guess this is fine as it's still only the setup for the function, with 
the body probably being much heavier anyway.

> +       // split 16bit values into two tables
> +       uzp2            v1.16b, v3.16b, v3.16b     // sao_offset_val -> upper
> +       uzp1            v0.16b, v3.16b, v3.16b     // sao_offset_val -> lower
> +       movi            v2.16b, #2
> +       mov             x15, #192
> +       // strides between end of line and next src/dst
> +       sub             x15, x15, x5               // stride_src - width

One could fold the sxtw of w5 into this line too, as "sub x15, x15, w5, 
sxtw", but as you need it sign extended in a couple places below too, 
keeping it as a separate instruction might be good.

> +       sub             x16, x2, x5                // stride_dst - width
> +       mov             x11, x1                    // copy base src
> +1:     // new line
> +       mov             x14, x5                    // copy width
> +       sub             x12, x11, x4               // src_a (prev) = src - sao_edge_pos
> +       add             x13, x11, x4               // src_b (next) = src + sao_edge_pos
> +2:     // process 16 bytes
> +       ld1             {v3.16b}, [x11], #16       // load src
> +       ld1             {v4.16b}, [x12], #16       // load src_a (prev)
> +       ld1             {v5.16b}, [x13], #16       // load src_b (next)
> +       cmhi            v16.16b, v4.16b, v3.16b    // (prev > cur)
> +       cmhi            v17.16b, v3.16b, v4.16b    // (cur > prev)
> +       cmhi            v18.16b, v5.16b, v3.16b    // (next > cur)
> +       cmhi            v19.16b, v3.16b, v5.16b    // (cur > next)
> +       sub             v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev)
> +       sub             v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next) = (cur > next) - (cur < next)
> +       add             v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1
> +       add             v20.16b, v20.16b, v2.16b   // offset_val = diff + 2
> +       tbl             v16.16b, {v0.16b}, v20.16b
> +       tbl             v17.16b, {v1.16b}, v20.16b
> +       zip1            v18.16b, v16.16b, v17.16b  // sao_offset_val lower ->
> +       zip2            v19.16b, v16.16b, v17.16b  // sao_offset_val upper ->
> +       uxtl            v20.8h, v3.8b              // src[0:7]
> +       uxtl2           v21.8h, v3.16b             // src[7:15]

I think it might be better to schedule the uxtl after the 'tbl' which can 
have a fair bit of latency.

> +       sqadd           v20.8h, v18.8h, v20.8h     // + sao_offset_val
> +       sqadd           v21.8h, v19.8h, v21.8h

If you can do without clipping, you could avoid the uxtl by doing uaddw, 
uaddw2 here instead. But it might turn out that the uxtl is essentially 
for free while waiting for the results from tbl anyway.

> +       sqxtun          v3.8b, v20.8h
> +       sqxtun2         v3.16b, v21.8h
> +       st1             {v3.16b}, [x0], #16
> +       subs            x14, x14, #16              // filtered 16 bytes
> +       b.ne            2b                         // do we have width to filter?
> +       // no width to filter, setup next line
> +       add             x11, x11, x15              // stride src to next line
> +       add             x0, x0, x16                // stride dst to next line
> +       subs            w6, w6, #1                 // filtered line
> +       b.ne            1b                         // do we have lines to process?
> +       // no lines to filter
> +       ret
> +endfunc
> -- 
> 2.30.1 (Apple Git-130)

Looks mostly ok, the permutation bit feels a bit surprising though.

// Martin