[FFmpeg-devel] [PATCH v2 4/7] x86/vvcdec: sao, add avx2 support
Nuo Mi
nuomi2021 at gmail.com
Sat May 3 12:13:16 EEST 2025
From: Shaun Loo <shaunloo10 at gmail.com>
This is a part of Google Summer of Code 2023
Co-authored-by: Nuo Mi <nuomi2021 at gmail.com>
---
libavcodec/x86/h26x/h2656_sao.asm | 8 +--
libavcodec/x86/vvc/Makefile | 2 +
libavcodec/x86/vvc/dsp_init.c | 41 +++++++++++
libavcodec/x86/vvc/sao.asm | 73 +++++++++++++++++++
libavcodec/x86/vvc/sao_10bit.asm | 113 ++++++++++++++++++++++++++++++
5 files changed, 233 insertions(+), 4 deletions(-)
create mode 100644 libavcodec/x86/vvc/sao.asm
create mode 100644 libavcodec/x86/vvc/sao_10bit.asm
diff --git a/libavcodec/x86/h26x/h2656_sao.asm b/libavcodec/x86/h26x/h2656_sao.asm
index 504fcb388b..a80ee26178 100644
--- a/libavcodec/x86/h26x/h2656_sao.asm
+++ b/libavcodec/x86/h26x/h2656_sao.asm
@@ -147,7 +147,7 @@ align 16
%assign i i+mmsize
%endrep
-%if %2 == 48
+%if %2 == 48 || %2 == 80 || %2 == 112
INIT_XMM cpuname
mova m13, [srcq + i]
@@ -160,7 +160,7 @@ INIT_XMM cpuname
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
-%endif ; %2 == 48
+%endif ; %2 == 48 || %2 == 80 || %2 == 112
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -280,7 +280,7 @@ align 16
%assign i i+mmsize
%endrep
-%if %2 == 48
+%if %2 == 48 || %2 == 80 || %2 == 112
INIT_XMM cpuname
mova m1, [srcq + i]
@@ -291,7 +291,7 @@ INIT_XMM cpuname
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
-%endif
+%endif ; %2 == 48 || %2 == 80 || %2 == 112
add dstq, dststrideq
add srcq, EDGE_SRCSTRIDE
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index 86a6c8ba7c..c426b156c1 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -8,4 +8,6 @@ X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/alf.o \
x86/vvc/mc.o \
x86/vvc/of.o \
x86/vvc/sad.o \
+ x86/vvc/sao.o \
+ x86/vvc/sao_10bit.o \
x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index bb68ba0b1e..cbcfa40a66 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -215,6 +215,44 @@ ALF_FUNCS(16, 12, avx2)
#endif
+#define SAO_FILTER_FUNC(wd, bitd, opt) /* declare (not define) the band and edge SAO filters for one width / bit depth / ISA suffix */ \
+void ff_vvc_sao_band_filter_##wd##_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+ const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
+void ff_vvc_sao_edge_filter_##wd##_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, /* edge variant takes no source-stride parameter */ \
+ const int16_t *sao_offset_val, int eo, int width, int height); \
+
+#define SAO_FILTER_FUNCS(bitd, opt) /* expand SAO_FILTER_FUNC for the nine widths 8, 16, 32, 48, ..., 128 */ \
+ SAO_FILTER_FUNC(8, bitd, opt) \
+ SAO_FILTER_FUNC(16, bitd, opt) \
+ SAO_FILTER_FUNC(32, bitd, opt) \
+ SAO_FILTER_FUNC(48, bitd, opt) \
+ SAO_FILTER_FUNC(64, bitd, opt) \
+ SAO_FILTER_FUNC(80, bitd, opt) \
+ SAO_FILTER_FUNC(96, bitd, opt) \
+ SAO_FILTER_FUNC(112, bitd, opt) \
+ SAO_FILTER_FUNC(128, bitd, opt) \
+
+SAO_FILTER_FUNCS(8, avx2) /* prototypes with the _8_avx2 suffix */
+SAO_FILTER_FUNCS(10, avx2) /* prototypes with the _10_avx2 suffix */
+SAO_FILTER_FUNCS(12, avx2) /* prototypes with the _12_avx2 suffix */
+
+#define SAO_FILTER_INIT(type, bitd, opt) do { /* fill c->sao.<type>_filter[0..8], ordered by width 8, 16, 32, 48, ..., 128; needs `c` in scope at the expansion site */ \
+ c->sao.type##_filter[0] = ff_vvc_sao_##type##_filter_8_##bitd##_##opt; \
+ c->sao.type##_filter[1] = ff_vvc_sao_##type##_filter_16_##bitd##_##opt; \
+ c->sao.type##_filter[2] = ff_vvc_sao_##type##_filter_32_##bitd##_##opt; \
+ c->sao.type##_filter[3] = ff_vvc_sao_##type##_filter_48_##bitd##_##opt; \
+ c->sao.type##_filter[4] = ff_vvc_sao_##type##_filter_64_##bitd##_##opt; \
+ c->sao.type##_filter[5] = ff_vvc_sao_##type##_filter_80_##bitd##_##opt; \
+ c->sao.type##_filter[6] = ff_vvc_sao_##type##_filter_96_##bitd##_##opt; \
+ c->sao.type##_filter[7] = ff_vvc_sao_##type##_filter_112_##bitd##_##opt; \
+ c->sao.type##_filter[8] = ff_vvc_sao_##type##_filter_128_##bitd##_##opt; \
+} while (0)
+
+#define SAO_INIT(bitd, opt) do { /* install both SAO function tables (band, then edge) for one bit depth / ISA suffix */ \
+ SAO_FILTER_INIT(band, bitd, opt); \
+ SAO_FILTER_INIT(edge, bitd, opt); \
+} while (0)
+
#define AVG_INIT(bd, opt) do { \
c->inter.avg = bf(vvc_avg, bd, opt); \
c->inter.w_avg = bf(vvc_w_avg, bd, opt); \
@@ -329,6 +367,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
// filter
ALF_INIT(8);
+ SAO_INIT(8, avx2);
}
#endif
break;
@@ -350,6 +389,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
// filter
ALF_INIT(10);
+ SAO_INIT(10, avx2);
}
#endif
break;
@@ -371,6 +411,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
// filter
ALF_INIT(12);
+ SAO_INIT(12, avx2);
}
#endif
break;
diff --git a/libavcodec/x86/vvc/sao.asm b/libavcodec/x86/vvc/sao.asm
new file mode 100644
index 0000000000..5f7d7e5358
--- /dev/null
+++ b/libavcodec/x86/vvc/sao.asm
@@ -0,0 +1,73 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for VVC 8bit decoding
+;*
+;* Copyright (c) 2024 Shaun Loo
+;* Copyright (c) 2024 Nuo Mi
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%define MAX_PB_SIZE 128
+%include "libavcodec/x86/h26x/h2656_sao.asm"
+
+%macro VVC_SAO_BAND_FILTER 2 ; both arguments are passed through unchanged; the call sites in this file use %1 = 8..128 (block width)
+ H2656_SAO_BAND_FILTER vvc, %1, %2 ; macro from the included h2656_sao.asm, invoked with "vvc" as its first argument
+%endmacro
+
+%macro VVC_SAO_BAND_FILTER_FUNCS 0 ; band filters for all nine widths; NOTE(review): nothing in this file invokes this macro
+VVC_SAO_BAND_FILTER 8, 0
+VVC_SAO_BAND_FILTER 16, 1
+VVC_SAO_BAND_FILTER 32, 2
+VVC_SAO_BAND_FILTER 48, 2 ; 48, 80 and 112 reuse the previous width's second argument; h2656_sao.asm has an extra XMM step for exactly these widths
+VVC_SAO_BAND_FILTER 64, 4
+VVC_SAO_BAND_FILTER 80, 4
+VVC_SAO_BAND_FILTER 96, 6
+VVC_SAO_BAND_FILTER 112, 6
+VVC_SAO_BAND_FILTER 128, 8
+%endmacro
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2 ; XMM registers for widths 8 and 16
+VVC_SAO_BAND_FILTER 8, 0
+VVC_SAO_BAND_FILTER 16, 1
+INIT_YMM avx2 ; YMM registers from width 32 upwards
+VVC_SAO_BAND_FILTER 32, 1
+VVC_SAO_BAND_FILTER 48, 1 ; 48, 80 and 112 reuse the previous width's second argument; h2656_sao.asm has an extra XMM step for exactly these widths
+VVC_SAO_BAND_FILTER 64, 2
+VVC_SAO_BAND_FILTER 80, 2
+VVC_SAO_BAND_FILTER 96, 3
+VVC_SAO_BAND_FILTER 112, 3
+VVC_SAO_BAND_FILTER 128, 4
+%endif ; HAVE_AVX2_EXTERNAL
+
+%macro VVC_SAO_EDGE_FILTER 2-3 ; accepts two or three arguments
+ H2656_SAO_EDGE_FILTER vvc, %{1:-1} ; %{1:-1} forwards every argument that was supplied, after the leading "vvc"
+%endmacro
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2 ; XMM registers for widths 8 and 16
+VVC_SAO_EDGE_FILTER 8, 0
+VVC_SAO_EDGE_FILTER 16, 1, a ; NOTE(review): a/u presumably select aligned vs. unaligned accesses - confirm in h2656_sao.asm
+INIT_YMM avx2 ; YMM registers from width 32 upwards
+VVC_SAO_EDGE_FILTER 32, 1, a
+VVC_SAO_EDGE_FILTER 48, 1, u ; "u" is passed for exactly the widths 48, 80 and 112
+VVC_SAO_EDGE_FILTER 64, 2, a
+VVC_SAO_EDGE_FILTER 80, 2, u
+VVC_SAO_EDGE_FILTER 96, 3, a
+VVC_SAO_EDGE_FILTER 112, 3, u
+VVC_SAO_EDGE_FILTER 128, 4, a
+%endif ; HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/vvc/sao_10bit.asm b/libavcodec/x86/vvc/sao_10bit.asm
new file mode 100644
index 0000000000..b7d3d08008
--- /dev/null
+++ b/libavcodec/x86/vvc/sao_10bit.asm
@@ -0,0 +1,113 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for VVC 10/12bit decoding
+;*
+;* Copyright (c) 2024 Shaun Loo
+;* Copyright (c) 2024 Nuo Mi
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%define MAX_PB_SIZE 128
+%include "libavcodec/x86/h26x/h2656_sao_10bit.asm"
+
+%macro VVC_SAO_BAND_FILTER 3 ; all three arguments are passed through; call sites use %1 = 10 or 12 (bit depth) and %2 = 8..128 (width)
+ H2656_SAO_BAND_FILTER vvc, %1, %2, %3 ; macro from the included h2656_sao_10bit.asm, invoked with "vvc" as its first argument
+%endmacro
+
+%macro VVC_SAO_BAND_FILTER_FUNCS 1 ; %1 is forwarded as the first argument of every entry (10 or 12 at the call sites)
+ VVC_SAO_BAND_FILTER %1, 8, 1 ; the third argument equals width / 8 for every entry in this list
+ VVC_SAO_BAND_FILTER %1, 16, 2
+ VVC_SAO_BAND_FILTER %1, 32, 4
+ VVC_SAO_BAND_FILTER %1, 48, 6
+ VVC_SAO_BAND_FILTER %1, 64, 8
+ VVC_SAO_BAND_FILTER %1, 80, 10
+ VVC_SAO_BAND_FILTER %1, 96, 12
+ VVC_SAO_BAND_FILTER %1, 112, 14
+ VVC_SAO_BAND_FILTER %1, 128, 16
+%endmacro
+
+%macro VVC_SAO_BAND_FILTER_FUNCS 0 ; zero-argument overload of the same name: expands the one-argument form for 10 and for 12
+ VVC_SAO_BAND_FILTER_FUNCS 10
+ VVC_SAO_BAND_FILTER_FUNCS 12
+%endmacro
+
+INIT_XMM sse2 ; NOTE(review): the dsp_init.c hunks in this patch only declare and install avx2 variants - confirm the sse2/avx expansions are wanted
+VVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx ; same macro list, expanded a second time under the avx setting
+VVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+
+%macro VVC_SAO_BAND_FILTER_FUNCS_AVX2 1 ; %1 is forwarded as the first argument of every entry (10 or 12 at the call sites)
+ INIT_XMM avx2 ; width 8 is the only entry expanded under the XMM setting
+ VVC_SAO_BAND_FILTER %1, 8, 1
+ INIT_YMM avx2 ; every remaining width is expanded under the YMM setting
+ VVC_SAO_BAND_FILTER %1, 16, 1 ; the third argument equals width / 16 for the YMM entries
+ VVC_SAO_BAND_FILTER %1, 32, 2
+ VVC_SAO_BAND_FILTER %1, 48, 3
+ VVC_SAO_BAND_FILTER %1, 64, 4
+ VVC_SAO_BAND_FILTER %1, 80, 5
+ VVC_SAO_BAND_FILTER %1, 96, 6
+ VVC_SAO_BAND_FILTER %1, 112, 7
+ VVC_SAO_BAND_FILTER %1, 128, 8
+%endmacro
+
+VVC_SAO_BAND_FILTER_FUNCS_AVX2 10
+VVC_SAO_BAND_FILTER_FUNCS_AVX2 12
+
+%endif ; HAVE_AVX2_EXTERNAL
+
+%macro VVC_SAO_EDGE_FILTER 3 ; all three arguments are passed through; call sites use %1 = 10 or 12 (bit depth) and %2 = 8..128 (width)
+ H2656_SAO_EDGE_FILTER vvc, %1, %2, %3 ; macro from the included h2656_sao_10bit.asm, invoked with "vvc" as its first argument
+%endmacro
+
+%macro VVC_SAO_EDGE_FILTER_FUNCS 1 ; %1 is forwarded as the first argument of every entry (10 or 12 at the call sites)
+ VVC_SAO_EDGE_FILTER %1, 8, 1 ; the third argument equals width / 8 for every entry in this list
+ VVC_SAO_EDGE_FILTER %1, 16, 2
+ VVC_SAO_EDGE_FILTER %1, 32, 4
+ VVC_SAO_EDGE_FILTER %1, 48, 6
+ VVC_SAO_EDGE_FILTER %1, 64, 8
+ VVC_SAO_EDGE_FILTER %1, 80, 10
+ VVC_SAO_EDGE_FILTER %1, 96, 12
+ VVC_SAO_EDGE_FILTER %1, 112, 14
+ VVC_SAO_EDGE_FILTER %1, 128, 16
+%endmacro
+
+INIT_XMM sse2 ; NOTE(review): the dsp_init.c hunks in this patch only declare and install avx2 variants - confirm the sse2 expansion is wanted
+VVC_SAO_EDGE_FILTER_FUNCS 10
+VVC_SAO_EDGE_FILTER_FUNCS 12
+
+%if HAVE_AVX2_EXTERNAL
+
+%macro VVC_SAO_EDGE_FILTER_FUNCS_AVX2 1 ; %1 is forwarded as the first argument of every entry (10 or 12 at the call sites)
+ INIT_XMM avx2 ; width 8 is the only entry expanded under the XMM setting
+ VVC_SAO_EDGE_FILTER %1, 8, 1
+ INIT_YMM avx2 ; every remaining width is expanded under the YMM setting
+ VVC_SAO_EDGE_FILTER %1, 16, 1 ; the third argument equals width / 16 for the YMM entries
+ VVC_SAO_EDGE_FILTER %1, 32, 2
+ VVC_SAO_EDGE_FILTER %1, 48, 3
+ VVC_SAO_EDGE_FILTER %1, 64, 4
+ VVC_SAO_EDGE_FILTER %1, 80, 5
+ VVC_SAO_EDGE_FILTER %1, 96, 6
+ VVC_SAO_EDGE_FILTER %1, 112, 7
+ VVC_SAO_EDGE_FILTER %1, 128, 8
+%endmacro
+
+VVC_SAO_EDGE_FILTER_FUNCS_AVX2 10
+VVC_SAO_EDGE_FILTER_FUNCS_AVX2 12
+
+%endif ; HAVE_AVX2_EXTERNAL
--
2.34.1
More information about the ffmpeg-devel
mailing list