[FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter

Tue Apr 29 10:51:15 EEST 2025

> On Apr 25, 2025, at 16:25, Martin Storsjö <martin at martin.st> wrote:
> 
> On Tue, 15 Apr 2025, Zhao Zhili wrote:
> 
>> From: Zhao Zhili <zhilizhao at tencent.com>
>> 
>> int8_t[] is enough for offset_table of 8 bit streams.
>> 
>> On rpi5:
>>                            Before               After
>> hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)
>> hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)
>> hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)
>> hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)
>> hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)
>> hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)
>> hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)
>> hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)
>> hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)
>> hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)
>> ---
>> libavcodec/aarch64/h26x/dsp.h             |  4 +
>> libavcodec/aarch64/h26x/sao_neon.S        | 93 ++++++++++++++---------
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 +-
>> libavcodec/aarch64/vvc/dsp_init.c         |  5 +-
>> 4 files changed, 65 insertions(+), 41 deletions(-)
>> 
>> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
>> index 0fefb4d70f..6ea6a8d36a 100644
>> --- a/libavcodec/aarch64/h26x/dsp.h
>> +++ b/libavcodec/aarch64/h26x/dsp.h
>> @@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
>>                                        ptrdiff_t stride_dst, ptrdiff_t stride_src,
>>                                        const int16_t *sao_offset_val, int sao_left_class,
>>                                        int width, int height);
>> +void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src,
>> +                                        ptrdiff_t stride_dst, ptrdiff_t stride_src,
>> +                                        const int16_t *sao_offset_val, int sao_left_class,
>> +                                        int width, int height);
>> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>>                                          const int16_t *sao_offset_val, int eo, int width, int height);
>> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>> diff --git a/libavcodec/aarch64/h26x/sao_neon.S b/libavcodec/aarch64/h26x/sao_neon.S
>> index c43820135e..60c026fe95 100644
>> --- a/libavcodec/aarch64/h26x/sao_neon.S
>> +++ b/libavcodec/aarch64/h26x/sao_neon.S
>> @@ -35,48 +35,67 @@
>> //                      int16_t *sao_offset_val, int sao_left_class,
>> //                      int width, int height)
>> function ff_h26x_sao_band_filter_8x8_8_neon, export=1
>> -        stp             xzr, xzr, [sp, #-64]!
>> +        stp             xzr, xzr, [sp, #-32]!
>>        stp             xzr, xzr, [sp, #16]
>> -        stp             xzr, xzr, [sp, #32]
>> -        stp             xzr, xzr, [sp, #48]
>>        mov             w8,  #4
>> -0:      ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1]
>> -        subs            w8,  w8,  #1
>> -        add             w10, w8,  w5               // k + sao_left_class
>> +0:
>> +        ldrsh           x9, [x4, x8, lsl #1]        // sao_offset_val[k+1]
>> +        subs            w8, w8, #1
>> +        add             w10, w8, w5                 // k + sao_left_class
>>        and             w10, w10, #0x1F
>> -        strh            w9, [sp, x10, lsl #1]
>> +        strb            w9, [sp, x10]
>>        bne             0b
>> -        add             w6,  w6,  #7
>> -        bic             w6,  w6,  #7
>> -        ld1             {v16.16b-v19.16b}, [sp], #64
>> -        sub             x2,  x2,  x6
>> -        sub             x3,  x3,  x6
>> -        movi            v20.8h,   #1
>> -1:      mov             w8,  w6                    // beginning of line
>> -2:      // Simple layout for accessing 16bit values
>> -        // with 8bit LUT.
>> -        //
>> -        //   00  01  02  03  04  05  06  07
>> -        // +----------------------------------->
>> -        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
>> -        // +----------------------------------->
>> -        //    i-0     i-1     i-2     i-3
>> -        ld1             {v2.8b}, [x1], #8          // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
>> -        subs            w8, w8,  #8
>> -        uxtl            v0.8h,  v2.8b              // load src[x]
>> -        ushr            v2.8h,  v0.8h, #3          // >> BIT_DEPTH - 3
>> -        shl             v1.8h,  v2.8h, #1          // low (x2, accessing short)
>> -        add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
>> -        sli             v1.8h,  v3.8h, #8          // shift insert index to upper byte
>> -        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // table
>> -        add             v1.8h,  v0.8h, v2.8h       // src[x] + table
>> -        sqxtun          v4.8b,  v1.8h              // clip + narrow
>> -        st1             {v4.8b}, [x0], #8          // store
>> -        // done 8 pixels
>> +        ldp             q16, q17, [sp], #32
>> +1:
>> +        ld1             {v2.8b}, [x1], x3
>> +        subs            w7, w7, #1
>> +        uxtl            v0.8h, v2.8b
>> +        ushr            v3.8b, v2.8b, #3          // >> BIT_DEPTH - 3
> 
> Nitpick: The comment on this line seems to be misaligned with the other comments below - please check.

Fixed before push.

> 
>> +        tbx             v3.8b, {v16.16b-v17.16b}, v3.8b
> 
> Is there any specific reason for preferring tbx over tbl here? (I know the existing code used tbx.) Without having studied cycle tables, I would expect tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx is faster)?

tbl is indeed faster, and the improvement is quite impressive. Changed to tbl before pushing.

                             Before               tbx             tbl
hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)    252.3 ( 1.00x)
hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)     61.0 ( 4.57x)
hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)    864.9 ( 1.00x)
hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)    150.0 ( 6.26x)
hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)   3871.6 ( 1.00x)
hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)    550.6 ( 7.39x)
hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)   8182.6 ( 1.00x)
hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)   1185.8 ( 7.36x)
hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)  14038.9 ( 1.00x)
hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)   2078.4 ( 7.15x)

> 
> 
> Other than these comments, this patch looks good to me, thanks - feel free to push.
> 
> // Martin
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org <mailto:ffmpeg-devel at ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org <mailto:ffmpeg-devel-request at ffmpeg.org> with subject "unsubscribe".