[FFmpeg-devel] [PATCH v3 3/3] aarch64/vvc: Add dmvr
Zhao Zhili
quinkblack at foxmail.com
Thu Sep 26 18:58:12 EEST 2024
From: Zhao Zhili <zhilizhao at tencent.com>
dmvr_8_12x20_c: 1.5 ( 1.00x)
dmvr_8_12x20_neon: 0.2 ( 6.56x)
dmvr_8_20x12_c: 1.0 ( 1.00x)
dmvr_8_20x12_neon: 0.2 ( 4.33x)
dmvr_8_20x20_c: 1.7 ( 1.00x)
dmvr_8_20x20_neon: 0.5 ( 3.63x)
dmvr_12_12x20_c: 2.2 ( 1.00x)
dmvr_12_12x20_neon: 0.5 ( 4.68x)
dmvr_12_20x12_c: 2.0 ( 1.00x)
dmvr_12_20x12_neon: 0.5 ( 4.16x)
dmvr_12_20x20_c: 3.7 ( 1.00x)
dmvr_12_20x20_neon: 0.7 ( 5.14x)
---
libavcodec/aarch64/vvc/dsp_init.c | 4 ++
libavcodec/aarch64/vvc/inter.S | 87 ++++++++++++++++++++++++++++++-
2 files changed, 90 insertions(+), 1 deletion(-)
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 56226ae802..4f7ef65aa7 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -88,6 +88,8 @@ W_AVG_FUN(12)
const uint8_t *_src, ptrdiff_t _src_stride, int height, \
intptr_t mx, intptr_t my, int width);
+DMVR_FUN(, 8)
+DMVR_FUN(, 12)
DMVR_FUN(hv_, 8)
DMVR_FUN(hv_, 10)
DMVR_FUN(hv_, 12)
@@ -164,6 +166,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.avg = ff_vvc_avg_8_neon;
c->inter.w_avg = vvc_w_avg_8;
+ c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
@@ -213,6 +216,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
} else if (bd == 12) {
c->inter.avg = ff_vvc_avg_12_neon;
c->inter.w_avg = vvc_w_avg_12;
+ c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 4fc8def133..b6b079b569 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -235,7 +235,7 @@ vvc_avg w_avg, 12
* x5: intptr_t my
* w6: int width
*/
-function ff_vvc_dmvr_hv_8_neon, export=1
+function ff_vvc_dmvr_8_neon, export=1
dst .req x0
src .req x1
src_stride .req x2
@@ -243,6 +243,91 @@ function ff_vvc_dmvr_hv_8_neon, export=1
mx .req x4
my .req x5
width .req w6
+
+ sxtw x6, w6 // x6 = sign-extended width, used in the 64-bit pointer arithmetic below
+ mov x7, #(VVC_MAX_PB_SIZE * 2 + 8) // base of the end-of-row dst step; the +8 pre-cancels the final 4-halfword store, which is not covered by a post-increment of its own
+ cmp width, #16
+ sub src_stride, src_stride, x6 // the loads below already advance src by 8+4 or 16+4 bytes per row; keep only the remainder (assumes width is 12 or 20, the sizes benchmarked in the commit message)
+ cset w15, gt // width > 16
+ movi v16.8h, #2 // DMVR_SHIFT: per-lane left-shift count consumed by ushl
+ sub x7, x7, x6, lsl #1 // dst holds 2 bytes per sample: x7 = VVC_MAX_PB_SIZE * 2 + 8 - 2 * width
+1:
+ cbz w15, 2f // width <= 16: use the 8-sample path at 2:
+ ldr q0, [src], #16 // width > 16: load 16 source bytes, post-increment src
+ uxtl v1.8h, v0.8b // zero-extend the low 8 bytes to 16-bit lanes
+ uxtl2 v2.8h, v0.16b // zero-extend the high 8 bytes to 16-bit lanes
+ ushl v1.8h, v1.8h, v16.8h // sample << 2
+ ushl v2.8h, v2.8h, v16.8h
+ stp q1, q2, [dst], #32 // store 16 halfwords, post-increment dst
+ b 3f
+2:
+ ldr d0, [src], #8 // load 8 source bytes, post-increment src
+ uxtl v1.8h, v0.8b // zero-extend to 16-bit lanes
+ ushl v1.8h, v1.8h, v16.8h // sample << 2
+ str q1, [dst], #16 // store 8 halfwords, post-increment dst
+3:
+ subs height, height, #1 // row counter; the flags set here are consumed by b.ne at the bottom of the loop
+ ldr s3, [src], #4 // both paths finish the row with 4 more source bytes
+ uxtl v4.8h, v3.8b // zero-extend to 16-bit lanes (only the low 4 lanes are used)
+ ushl v4.4h, v4.4h, v16.4h // sample << 2
+ st1 {v4.4h}, [dst], x7 // store 4 halfwords, then step dst by x7 to reach the next output row
+
+ add src, src, src_stride // add the remaining stride to reach the next source row
+ b.ne 1b // loop while height != 0; nothing between subs and here modifies the flags
+
+ ret
+endfunc
+
+function ff_vvc_dmvr_12_neon, export=1
+ sxtw x6, w6 // x6 = sign-extended width, used in the 64-bit pointer arithmetic below
+ mov x7, #(VVC_MAX_PB_SIZE * 2 + 8) // base of the end-of-row dst step; the +8 pre-cancels the final 4-halfword store
+ cmp width, #16 // width is a .req alias declared earlier in the file
+ sub src_stride, src_stride, x6, lsl #1 // source samples are 2 bytes each here: the loads below already advance src by 2 * width bytes (assumes width is 12 or 20)
+ cset w15, gt // width > 16
+ movi v16.8h, #2 // offset4: rounding term added before the >> 2 below
+ sub x7, x7, x6, lsl #1 // x7 = VVC_MAX_PB_SIZE * 2 + 8 - 2 * width
+1:
+ cbz w15, 2f // width <= 16: use the 8-sample path at 2:
+ ldp q0, q1, [src], #32 // width > 16: load 16 halfword samples, post-increment src
+ uaddl v2.4s, v0.4h, v16.4h // widen to 32 bit and add the rounding offset; NOTE(review): widen/add/shift/narrow looks equivalent to a single "urshr .8h, #2" - confirm
+ uaddl2 v3.4s, v0.8h, v16.8h
+ uaddl v4.4s, v1.4h, v16.4h
+ uaddl2 v5.4s, v1.8h, v16.8h
+ ushr v2.4s, v2.4s, #2 // (sample + 2) >> 2
+ ushr v3.4s, v3.4s, #2
+ ushr v4.4s, v4.4s, #2
+ ushr v5.4s, v5.4s, #2
+ uqxtn v2.4h, v2.4s // narrow back to 16 bit with unsigned saturation
+ uqxtn2 v2.8h, v3.4s
+ uqxtn v4.4h, v4.4s
+ uqxtn2 v4.8h, v5.4s
+
+ stp q2, q4, [dst], #32 // store 16 halfwords, post-increment dst
+ b 3f
+2:
+ ldr q0, [src], #16 // load 8 halfword samples, post-increment src
+ uaddl v2.4s, v0.4h, v16.4h // widen to 32 bit and add the rounding offset
+ uaddl2 v3.4s, v0.8h, v16.8h
+ ushr v2.4s, v2.4s, #2 // (sample + 2) >> 2
+ ushr v3.4s, v3.4s, #2
+ uqxtn v2.4h, v2.4s // narrow back to 16 bit with unsigned saturation
+ uqxtn2 v2.8h, v3.4s
+ str q2, [dst], #16 // store 8 halfwords, post-increment dst
+3:
+ subs height, height, #1 // row counter; the flags set here are consumed by b.ne at the bottom of the loop
+ ldr d0, [src], #8 // both paths finish the row with 4 more halfword samples
+ uaddl v3.4s, v0.4h, v16.4h // widen to 32 bit and add the rounding offset
+ ushr v3.4s, v3.4s, #2 // (sample + 2) >> 2
+ uqxtn v3.4h, v3.4s // narrow back to 16 bit with unsigned saturation
+ st1 {v3.4h}, [dst], x7 // store 4 halfwords, then step dst by x7 to reach the next output row
+
+ add src, src, src_stride // add the remaining stride to reach the next source row
+ b.ne 1b // loop while height != 0; nothing between subs and here modifies the flags
+
+ ret
+endfunc
+
+function ff_vvc_dmvr_hv_8_neon, export=1
tmp0 .req x7
tmp1 .req x8
--
2.46.0
More information about the ffmpeg-devel
mailing list