[FFmpeg-devel] [PATCH 1/2] lavc/aarch64: new 8-bit hevc 16x16 idct
J. Dekker
jdek at itanimul.li
Thu Jun 23 15:23:10 EEST 2022
old:
hevc_idct_16x16_8_c: 5366.2
hevc_idct_16x16_8_neon: 1493.2
new:
hevc_idct_16x16_8_c: 5363.2
hevc_idct_16x16_8_neon: 943.5
Co-developed-by: Rafal Dabrowa <fatwildcat at gmail.com>
Signed-off-by: J. Dekker <jdek at itanimul.li>
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 666 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 3 +-
2 files changed, 668 insertions(+), 1 deletion(-)
This idct is significantly faster than the one we currently have. I
suspect it's for a couple of reasons: 1) it's only written for 8-bit,
2) it's unrolled significantly more. It comes at a hefty cost of
roughly 2.25x the object size. I'm wondering whether this idct is
salvageable, or whether the one we have should just be improved
instead.
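
For reviewers who don't have the 16-point butterfly paged in, this is
roughly what the vertical pass computes per column (my own sketch, not
the lavc reference; names are mine, col_limit pruning omitted; shift is
7 for the vertical pass and 12 for the horizontal one, matching the
sqrshrn #7 / #12 in the asm):

#include <stdint.h>

/* .Lo16transform0..7: odd basis, applied to rows 1, 3, ..., 15 */
static const int o16t[8][8] = {
    { 90,  87,  80,  70,  57,  43,  25,   9 },
    { 87,  57,   9, -43, -80, -90, -70, -25 },
    { 80,   9, -70, -87, -25,  57,  90,  43 },
    { 70, -43, -87,   9,  90,  25, -80, -57 },
    { 57, -80, -25,  90,  -9, -87,  43,  70 },
    { 43, -90,  57,  25, -87,  70,   9, -80 },
    { 25, -70,  90, -80,  43,   9, -57,  87 },
    {  9, -25,  43, -57,  70, -80,  87, -90 },
};
/* .Lo8transform0..3: applied to rows 2, 6, 10, 14 */
static const int o8t[4][4] = {
    { 89,  75,  50,  18 }, { 75, -18, -89, -50 },
    { 50, -89,  18,  75 }, { 18, -50,  75, -89 },
};

static int16_t sat_rshift(int v, int shift) /* what sqrshrn does */
{
    v = (v + (1 << (shift - 1))) >> shift;
    return v < -32768 ? -32768 : v > 32767 ? 32767 : v;
}

static void idct16_col(int16_t *c, int shift) /* c: one column, stride 16 */
{
    int e8[4], o8[4], e16[8], o16[8], k, j;
    int e0 = 64 * c[0]      + 64 * c[8 * 16];
    int e1 = 64 * c[0]      - 64 * c[8 * 16];
    int o0 = 83 * c[4 * 16] + 36 * c[12 * 16];   /* .Lo0_coeff */
    int o1 = 36 * c[4 * 16] - 83 * c[12 * 16];
    e8[0] = e0 + o0; e8[3] = e0 - o0;
    e8[1] = e1 + o1; e8[2] = e1 - o1;
    for (k = 0; k < 4; k++) {
        for (o8[k] = 0, j = 0; j < 4; j++)
            o8[k] += o8t[k][j] * c[(4 * j + 2) * 16];
        e16[k]     = e8[k] + o8[k];
        e16[7 - k] = e8[k] - o8[k];
    }
    for (k = 0; k < 8; k++)              /* all reads before any write */
        for (o16[k] = 0, j = 0; j < 8; j++)
            o16[k] += o16t[k][j] * c[(2 * j + 1) * 16];
    for (k = 0; k < 8; k++) {
        c[k * 16]        = sat_rshift(e16[k] + o16[k], shift);
        c[(15 - k) * 16] = sat_rshift(e16[k] - o16[k], shift);
    }
}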
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..784bae33b3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -618,3 +618,669 @@ idct_dc 16, 10
idct_dc 32, 8
idct_dc 32, 10
+
+// WIP
+
+.Lo0_coeff: .hword 83, 36, 0, 0, 0, 0, 0, 0 // transform[8,24][0]
+.Lo8transform0: .hword 89, 75, 50, 18 // transform[4,12,20,28][0]
+.Lo8transform1: .hword 75, -18, -89, -50
+.Lo8transform2: .hword 50, -89, 18, 75
+.Lo8transform3: .hword 18, -50, 75, -89
+
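+// Masks for truncating rows whose tail is uncoded when col_limit is
+// small: the first keeps 5 leading coefficients of a row, the second 1.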
+.LimitMask:
+ .hword 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0
+ .hword 0xffff, 0, 0, 0, 0, 0, 0, 0
+
+.Leo_coeff:
+ .hword 64, 64, 64, 64, 83, 36, -36, -83 // transform[0,8][0-3]
+ .hword 64, -64, -64, 64, 36, -83, 83, -36 // transform[16,24][0-3]
+ .hword 89, 75, 50, 18, 75, -18, -89, -50 // transform[4,12][0-3]
+ .hword 50, -89, 18, 75, 18, -50, 75, -89 // transform[20,28][0-3]
+.Lo16transform0: .hword 90, 87, 80, 70, 57, 43, 25, 9 // transform[2][0-7], also transform[2,6,10..][0]
+.Lo16transform1: .hword 87, 57, 9, -43, -80, -90, -70, -25 // transform[6][0-7]
+.Lo16transform2: .hword 80, 9, -70, -87, -25, 57, 90, 43 // transform[10][0-7]
+.Lo16transform3: .hword 70, -43, -87, 9, 90, 25, -80, -57 // transform[14][0-7]
+.Lo16transform4: .hword 57, -80, -25, 90, -9, -87, 43, 70 // transform[18][0-7]
+.Lo16transform5: .hword 43, -90, 57, 25, -87, 70, 9, -80 // transform[22][0-7]
+.Lo16transform6: .hword 25, -70, 90, -80, 43, 9, -57, 87 // transform[26][0-7]
+.Lo16transform7: .hword 9, -25, 43, -57, 70, -80, 87, -90 // transform[30][0-7]
+
+// void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit)
+function ff_hevc_idct_16x16_8_neon_new, export=1
+ sub sp, sp, 64
+ st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+ sub sp, sp, 32
+ st1 {v14.16b, v15.16b}, [sp]
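+ // Two passes over the 16x16 block of int16 coefficients (row stride
+ // 32 bytes): pass 1 (loop 1b) transforms columns, handling the left
+ // and right 8-column halves in turn (x3 = 0, 1) and writing the
+ // intermediates back in place; pass 2 (loop 14b) then runs the same
+ // 16-point butterfly over each of the 16 rows. v8-v11 and v14-v15
+ // are call-preserved, hence the spills above.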
+ mov x3, 0
+ mov x2, x0
+1: mov x4, x2
+ mov x5, 32
+ ld1 {v16.8h}, [x4], x5
+ ld1 {v17.8h}, [x4], x5
+ ld1 {v18.8h}, [x4], x5
+ ld1 {v19.8h}, [x4], x5
+ ld1 {v20.8h}, [x4], x5
+ ld1 {v21.8h}, [x4], x5
+ ld1 {v22.8h}, [x4], x5
+ ld1 {v23.8h}, [x4], x5
+ ld1 {v24.8h}, [x4], x5
+ ld1 {v25.8h}, [x4], x5
+ ld1 {v26.8h}, [x4], x5
+ ld1 {v27.8h}, [x4], x5
+ ld1 {v28.8h}, [x4], x5
+ ld1 {v29.8h}, [x4], x5
+ ld1 {v30.8h}, [x4], x5
+ ld1 {v31.8h}, [x4], x5
+ cmp x1, 12
+ b.hs 5f
+ // limit2 below 16
+ bic x4, x1, 1
+ adr x5, .LimitMask
+ cbnz x3, 3f
+ // columns 0 .. 7 - cleanup of indexes 5 .. 7
+ ld1 {v0.8h}, [x5]
+ adr x5, 2f
+ add x5, x5, x4, lsl 2
+ add x5, x5, x4, lsl 1
+ br x5
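+ // Each entry below is 3 insns (12 bytes): two ANDs masking the first
+ // partially-coded rows, then a branch to 5f. The offset computed
+ // above is 6 * (col_limit & ~1), i.e. one entry per even col_limit.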
+2: and v17.16b, v17.16b, v0.16b // col_limit 0..1 -> limit2 == 4..5
+ and v19.16b, v19.16b, v0.16b
+ b 5f
+ and v19.16b, v19.16b, v0.16b // col_limit 2..3 -> limit2 == 6..7
+ and v21.16b, v21.16b, v0.16b
+ b 5f
+ and v21.16b, v21.16b, v0.16b // col_limit 4..5 -> limit2 == 8..9
+ and v23.16b, v23.16b, v0.16b
+ b 5f
+ and v23.16b, v23.16b, v0.16b // col_limit 6..7 -> limit2 == 10..11
+ and v25.16b, v25.16b, v0.16b
+ b 5f
+ and v25.16b, v25.16b, v0.16b // col_limit 8..9 -> limit2 == 12..13
+ and v27.16b, v27.16b, v0.16b
+ b 5f
+ and v27.16b, v27.16b, v0.16b // col_limit 10..11 -> limit2 == 14..15
+ and v29.16b, v29.16b, v0.16b
+ b 5f
+ // columns 8 .. 15
+3: subs x4, x4, 2
+ b.lo 5f
+ ld1 {v0.8h, v1.8h}, [x5]
+ adr x5, 4f
+ add x5, x5, x4, lsl 3
+ add x5, x5, x4, lsl 1
+ br x5
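+ // Entries here are 5 insns (20 bytes) each, padded with nops; the
+ // offset computed above is 10 * x4 with x4 = (col_limit & ~1) - 2.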
+4: and v17.16b, v17.16b, v1.16b // col_limit 2..3 -> limit2 == 2..3
+ b 5f
+ nop
+ nop
+ nop
+ and v17.16b, v17.16b, v1.16b // col_limit 4..5 -> limit2 == 4..5
+ and v19.16b, v19.16b, v1.16b
+ b 5f
+ nop
+ nop
+ and v17.16b, v17.16b, v0.16b // col_limit 6..7 -> limit2 == 6..7
+ and v19.16b, v19.16b, v1.16b
+ and v21.16b, v21.16b, v1.16b
+ b 5f
+ nop
+ and v17.16b, v17.16b, v0.16b // col_limit 8..9 -> limit2 == 8..9
+ and v19.16b, v19.16b, v0.16b
+ and v21.16b, v21.16b, v1.16b
+ and v23.16b, v23.16b, v1.16b
+ b 5f
+ and v19.16b, v19.16b, v0.16b // col_limit 10..11 -> limit2 == 10..11
+ and v21.16b, v21.16b, v0.16b
+ and v23.16b, v23.16b, v1.16b
+ and v25.16b, v25.16b, v1.16b
+ b 5f
+5: adr x4, .Lo0_coeff
+ ld1 {v14.8h}, [x4]
+
+ // v0,v1 = e0
+ sshll v0.4s, v16.4h, 6
+ sshll v1.4s, v24.4h, 6
+ add v0.4s, v0.4s, v1.4s
+ sshll2 v1.4s, v16.8h, 6
+ sshll2 v2.4s, v24.8h, 6
+ add v1.4s, v1.4s, v2.4s
+
+ // v2,v3 = o0
+ smull v2.4s, v20.4h, v14.h[0]
+ smlal v2.4s, v28.4h, v14.h[1]
+ smull2 v3.4s, v20.8h, v14.h[0]
+ smlal2 v3.4s, v28.8h, v14.h[1]
+
+ // v4,v5 = e_8[0]
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+
+ // v6,v7 = e_8[3]
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+
+
+ // v0,v1 = o_8[0]
+ adr x4, .Lo8transform0
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[0]
+ add v2.4s, v4.4s, v0.4s
+ add v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[0]
+ adr x4, .Lo16transform0
+ ld1 {v15.8h}, [x4]
+
+ mov x5, 16
+ cmp x1, 12
+ b.hs 6f
+ add x5, x1, 4
+ bic x5, x5, 1
+ cbz x3, 6f
+ orr x5, x1, 1
+ subs x5, x5, 2
+ csel x5, x5, xzr, hs
+6: mov x4, 64
+ sub x6, x4, x5, lsl 2
+ adr x5, 7f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
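+ // o_16 multiply-accumulate ladder: one smlal/smlal2 pair (8 bytes)
+ // per odd source row, highest row first. Entering at 7f + x6, with
+ // x6 = 64 - 4 * limit2, skips the rows known to be zero; the same x6
+ // is reused for the 8f-14f ladders below.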
+7: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[0 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ st1 {v10.8h}, [x2]
+
+ // tmp[15 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 15 * 32
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[7]
+ sub v2.4s, v4.4s, v0.4s
+ sub v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[7]
+ adr x4, .Lo16transform7
+ ld1 {v15.8h}, [x4]
+ adr x5, 8f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+8: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[7 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 7 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[8 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 8 * 32
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = o_8[3]
+ adr x4, .Lo8transform3
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[3]
+ add v2.4s, v6.4s, v0.4s
+ add v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[3]
+ adr x4, .Lo16transform3
+ ld1 {v15.8h}, [x4]
+ adr x5, 9f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+9: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6] // 13
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5] // 11
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4] // 9
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3] // 7
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2] // 5
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1] // 3
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0] // 1
+
+ // tmp[3 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 3 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[12 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 12 * 32
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[4]
+ sub v2.4s, v6.4s, v0.4s
+ sub v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[4]
+ adr x4, .Lo16transform4
+ ld1 {v15.8h}, [x4]
+ adr x5, 10f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+10: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[4 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 4 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[11 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 11 * 32
+ st1 {v10.8h}, [x4]
+
+
+ // v0,v1 = e1
+ sshll v0.4s, v16.4h, 6
+ sshll v1.4s, v24.4h, 6
+ sub v0.4s, v0.4s, v1.4s
+ sshll2 v1.4s, v16.8h, 6
+ sshll2 v2.4s, v24.8h, 6
+ sub v1.4s, v1.4s, v2.4s
+
+ // v2,v3 = o1
+ smull v2.4s, v20.4h, v14.h[1]
+ smlsl v2.4s, v28.4h, v14.h[0]
+ smull2 v3.4s, v20.8h, v14.h[1]
+ smlsl2 v3.4s, v28.8h, v14.h[0]
+
+ // v4,v5 = e_8[1]
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+
+ // v6,v7 = e_8[2]
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+
+ // v0,v1 = o_8[1]
+ adr x4, .Lo8transform1
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[1]
+ add v2.4s, v4.4s, v0.4s
+ add v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[1]
+ adr x4, .Lo16transform1
+ ld1 {v15.8h}, [x4]
+ adr x5, 11f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+11: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[1 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 1 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[14 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 14 * 32
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[6]
+ sub v2.4s, v4.4s, v0.4s
+ sub v3.4s, v5.4s, v1.4s
+
+ // v8,v9 = o_16[6]
+ adr x4, .Lo16transform6
+ ld1 {v15.8h}, [x4]
+ adr x5, 12f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+12: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[6 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 6 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[9 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 9 * 32
+ st1 {v10.8h}, [x4]
+
+ // v0,v1 = o_8[2]
+ adr x4, .Lo8transform2
+ ld1 {v15.4h}, [x4]
+ smull v0.4s, v18.4h, v15.h[0]
+ smlal v0.4s, v22.4h, v15.h[1]
+ smlal v0.4s, v26.4h, v15.h[2]
+ smlal v0.4s, v30.4h, v15.h[3]
+ smull2 v1.4s, v18.8h, v15.h[0]
+ smlal2 v1.4s, v22.8h, v15.h[1]
+ smlal2 v1.4s, v26.8h, v15.h[2]
+ smlal2 v1.4s, v30.8h, v15.h[3]
+
+ // v2,v3 = e_16[2]
+ add v2.4s, v6.4s, v0.4s
+ add v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[2]
+ adr x4, .Lo16transform2
+ ld1 {v15.8h}, [x4]
+ adr x5, 13f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+13: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[2 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 2 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[13 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 13 * 32
+ st1 {v10.8h}, [x4]
+
+ // v2,v3 = e_16[5]
+ sub v2.4s, v6.4s, v0.4s
+ sub v3.4s, v7.4s, v1.4s
+
+ // v8,v9 = o_16[5]
+ adr x4, .Lo16transform5
+ ld1 {v15.8h}, [x4]
+ adr x5, 14f
+ add x5, x5, x6
+ movi v8.4s, 0
+ movi v9.4s, 0
+ br x5
+14: smlal2 v9.4s, v31.8h, v15.h[7]
+ smlal v8.4s, v31.4h, v15.h[7]
+ smlal2 v9.4s, v29.8h, v15.h[6]
+ smlal v8.4s, v29.4h, v15.h[6]
+ smlal2 v9.4s, v27.8h, v15.h[5]
+ smlal v8.4s, v27.4h, v15.h[5]
+ smlal2 v9.4s, v25.8h, v15.h[4]
+ smlal v8.4s, v25.4h, v15.h[4]
+ smlal2 v9.4s, v23.8h, v15.h[3]
+ smlal v8.4s, v23.4h, v15.h[3]
+ smlal2 v9.4s, v21.8h, v15.h[2]
+ smlal v8.4s, v21.4h, v15.h[2]
+ smlal2 v9.4s, v19.8h, v15.h[1]
+ smlal v8.4s, v19.4h, v15.h[1]
+ smlal2 v9.4s, v17.8h, v15.h[0]
+ smlal v8.4s, v17.4h, v15.h[0]
+
+ // tmp[5 * 16]
+ add v10.4s, v2.4s, v8.4s
+ add v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 5 * 32
+ st1 {v10.8h}, [x4]
+
+ // tmp[10 * 16]
+ sub v10.4s, v2.4s, v8.4s
+ sub v11.4s, v3.4s, v9.4s
+ sqrshrn v10.4h, v10.4s, 7
+ sqrshrn2 v10.8h, v11.4s, 7
+ add x4, x2, 10 * 32
+ st1 {v10.8h}, [x4]
+
+ add x2, x2, 16
+ add x3, x3, 1
+ cmp x3, 2
+ b.lo 1b
+
+
+ // horizontal transform
+ adr x4, .Leo_coeff
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+ // o_16 jump address
+ mov x4, 64
+ bic x5, x1, 1
+ subs x4, x4, x5, lsl 2
+ csel x4, x4, xzr, hs
+ adr x5, 15f
+ add x5, x5, x4
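+ // Same zero-row skip as the vertical pass: enter the 15f ladder at
+ // x4 = max(64 - 4 * (col_limit & ~1), 0) bytes in.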
+
+ mov x3, 16
+14: ld1 {v6.8h, v7.8h}, [x0]
+
+ // v2 = e_8
+ smull v2.4s, v16.4h, v6.h[0]
+ smlal2 v2.4s, v16.8h, v6.h[4]
+ smlal v2.4s, v17.4h, v7.h[0]
+ smlal2 v2.4s, v17.8h, v7.h[4]
+
+ // v3 = o_8
+ smull v3.4s, v18.4h, v6.h[2]
+ smlal2 v3.4s, v18.8h, v6.h[6]
+ smlal v3.4s, v19.4h, v7.h[2]
+ smlal2 v3.4s, v19.8h, v7.h[6]
+
+ // v0,v1 = e_16
+ add v0.4s, v2.4s, v3.4s
+ sub v2.4s, v2.4s, v3.4s
+ mov v1.d[0], v2.d[1]
+ mov v1.d[1], v2.d[0]
+ rev64 v1.4s, v1.4s
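+ // v0 = e_16[0..3]; v1 = e_16[4..7], built by lane-reversing e_8 - o_8
+ // (d-half swap + rev64), since e_16[7-k] = e_8[k] - o_8[k].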
+
+ // v2,v3 = o_16
+ movi v2.4s, 0
+ movi v3.4s, 0
+ br x5
+15: smlal v2.4s, v27.4h, v7.h[7]
+ smlal2 v3.4s, v27.8h, v7.h[7]
+ smlal v2.4s, v26.4h, v7.h[5]
+ smlal2 v3.4s, v26.8h, v7.h[5]
+ smlal v2.4s, v25.4h, v7.h[3]
+ smlal2 v3.4s, v25.8h, v7.h[3]
+ smlal v2.4s, v24.4h, v7.h[1]
+ smlal2 v3.4s, v24.8h, v7.h[1]
+ smlal v2.4s, v23.4h, v6.h[7]
+ smlal2 v3.4s, v23.8h, v6.h[7]
+ smlal v2.4s, v22.4h, v6.h[5]
+ smlal2 v3.4s, v22.8h, v6.h[5]
+ smlal v2.4s, v21.4h, v6.h[3]
+ smlal2 v3.4s, v21.8h, v6.h[3]
+ smlal v2.4s, v20.4h, v6.h[1]
+ smlal2 v3.4s, v20.8h, v6.h[1]
+
+ // coeff
+ add v4.4s, v0.4s, v2.4s
+ add v5.4s, v1.4s, v3.4s
+ sub v6.4s, v0.4s, v2.4s
+ sub v7.4s, v1.4s, v3.4s
+ sqrshrn v4.4h, v4.4s, 12
+ sqrshrn2 v4.8h, v5.4s, 12
+ sqrshrn v6.4h, v6.4s, 12
+ sqrshrn2 v6.8h, v7.4s, 12
+ mov v5.d[0], v6.d[1]
+ mov v5.d[1], v6.d[0]
+ rev64 v5.8h, v5.8h
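+ // v4 = row[0..7]; the e-o difference yields row[15..8], so
+ // lane-reverse it into v5 = row[8..15] for one contiguous store.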
+ st1 {v4.8h, v5.8h}, [x0], 32
+ subs x3, x3, 1
+ b.ne 14b
+
+ ld1 {v14.16b, v15.16b}, [sp], 32
+ ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+ ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 2002530266..612ebb9541 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -45,6 +45,7 @@ void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -72,7 +73,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
c->idct[1] = ff_hevc_idct_8x8_8_neon;
- c->idct[2] = ff_hevc_idct_16x16_8_neon;
+ c->idct[2] = ff_hevc_idct_16x16_8_neon_new;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
--
2.32.0 (Apple Git-132)