[FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
Zhao Zhili
quinkblack at foxmail.com
Tue Apr 29 10:51:15 EEST 2025
> On Apr 25, 2025, at 16:25, Martin Storsjö <martin at martin.st> wrote:
>
> On Tue, 15 Apr 2025, Zhao Zhili wrote:
>
>> From: Zhao Zhili <zhilizhao at tencent.com>
>>
>> int8_t[] is enough for offset_table of 8 bit streams.
>>
>> On rpi5:
>> Before After
>> hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x)
>> hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x)
>> hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x)
>> hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x)
>> hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x)
>> hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x)
>> hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x)
>> hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x)
>> hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x)
>> hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x)
>> ---
>> libavcodec/aarch64/h26x/dsp.h | 4 +
>> libavcodec/aarch64/h26x/sao_neon.S | 93 ++++++++++++++---------
>> libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +-
>> libavcodec/aarch64/vvc/dsp_init.c | 5 +-
>> 4 files changed, 65 insertions(+), 41 deletions(-)
>>
>> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
>> index 0fefb4d70f..6ea6a8d36a 100644
>> --- a/libavcodec/aarch64/h26x/dsp.h
>> +++ b/libavcodec/aarch64/h26x/dsp.h
>> @@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
>> ptrdiff_t stride_dst, ptrdiff_t stride_src,
>> const int16_t *sao_offset_val, int sao_left_class,
>> int width, int height);
>> +void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src,
>> + ptrdiff_t stride_dst, ptrdiff_t stride_src,
>> + const int16_t *sao_offset_val, int sao_left_class,
>> + int width, int height);
>> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>> const int16_t *sao_offset_val, int eo, int width, int height);
>> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>> diff --git a/libavcodec/aarch64/h26x/sao_neon.S b/libavcodec/aarch64/h26x/sao_neon.S
>> index c43820135e..60c026fe95 100644
>> --- a/libavcodec/aarch64/h26x/sao_neon.S
>> +++ b/libavcodec/aarch64/h26x/sao_neon.S
>> @@ -35,48 +35,67 @@
>> // int16_t *sao_offset_val, int sao_left_class,
>> // int width, int height)
>> function ff_h26x_sao_band_filter_8x8_8_neon, export=1
>> - stp xzr, xzr, [sp, #-64]!
>> + stp xzr, xzr, [sp, #-32]!
>> stp xzr, xzr, [sp, #16]
>> - stp xzr, xzr, [sp, #32]
>> - stp xzr, xzr, [sp, #48]
>> mov w8, #4
>> -0: ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
>> - subs w8, w8, #1
>> - add w10, w8, w5 // k + sao_left_class
>> +0:
>> + ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
>> + subs w8, w8, #1
>> + add w10, w8, w5 // k + sao_left_class
>> and w10, w10, #0x1F
>> - strh w9, [sp, x10, lsl #1]
>> + strb w9, [sp, x10]
>> bne 0b
>> - add w6, w6, #7
>> - bic w6, w6, #7
>> - ld1 {v16.16b-v19.16b}, [sp], #64
>> - sub x2, x2, x6
>> - sub x3, x3, x6
>> - movi v20.8h, #1
>> -1: mov w8, w6 // beginning of line
>> -2: // Simple layout for accessing 16bit values
>> - // with 8bit LUT.
>> - //
>> - // 00 01 02 03 04 05 06 07
>> - // +----------------------------------->
>> - // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
>> - // +----------------------------------->
>> - // i-0 i-1 i-2 i-3
>> - ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
>> - subs w8, w8, #8
>> - uxtl v0.8h, v2.8b // load src[x]
>> - ushr v2.8h, v0.8h, #3 // >> BIT_DEPTH - 3
>> - shl v1.8h, v2.8h, #1 // low (x2, accessing short)
>> - add v3.8h, v1.8h, v20.8h // +1 access upper short
>> - sli v1.8h, v3.8h, #8 // shift insert index to upper byte
>> - tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
>> - add v1.8h, v0.8h, v2.8h // src[x] + table
>> - sqxtun v4.8b, v1.8h // clip + narrow
>> - st1 {v4.8b}, [x0], #8 // store
>> - // done 8 pixels
>> + ldp q16, q17, [sp], #32
>> +1:
>> + ld1 {v2.8b}, [x1], x3
>> + subs w7, w7, #1
>> + uxtl v0.8h, v2.8b
>> + ushr v3.8b, v2.8b, #3 // >> BIT_DEPTH - 3
>
> Nitpick: The comment on this line seems to be misaligned with the other comments below - please check.
Fixed before push.
>
>> + tbx v3.8b, {v16.16b-v17.16b}, v3.8b
>
> Is there any specific reason for preferring tbx over tbl here? (I know the existing code used tbx.) Without having studied cycle tables, I would expect tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx is faster)?
tbl can be faster. The result is quite impressive. Changed to tbl before push.
Before tbx tbl
hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x) 252.3 ( 1.00x)
hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x) 61.0 ( 4.57x)
hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x) 864.9 ( 1.00x)
hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x) 150.0 ( 6.26x)
hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x) 3871.6 ( 1.00x)
hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x) 550.6 ( 7.39x)
hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x) 8182.6 ( 1.00x)
hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x) 1185.8 ( 7.36x)
hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x) 14038.9 ( 1.00x)
hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x) 2078.4 ( 7.15x)
>
>
> Other than these comments, this patch looks good to me, thanks - feel free to push.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org <mailto:ffmpeg-devel at ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org <mailto:ffmpeg-devel-request at ffmpeg.org> with subject "unsubscribe".
More information about the ffmpeg-devel mailing list