[FFmpeg-devel] [PATCH 32/34] aarch64: vp9itxfm: Reorder the idct coefficients for better pairing
Martin Storsjö
martin at martin.st
Wed Mar 8 12:01:12 EET 2017
All elements are used pairwise, except for the first one.
Previously, the 16th element was unused. Move the unused element
to the second slot, so that none of the later element pairs is
split across two registers.
This simplifies loading only parts of the coefficients,
reducing the difference to the 16 bpp version.
This is cherry-picked from libav commit
09eb88a12e008d10a3f7a6be75d18ad98b368e68.
---
libavcodec/aarch64/vp9itxfm_neon.S | 124 ++++++++++++++++++-------------------
1 file changed, 62 insertions(+), 62 deletions(-)
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index dd9fde1..31c6e3c 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -22,7 +22,7 @@
#include "neon.S"
const itxfm4_coeffs, align=4
- .short 11585, 6270, 15137, 0
+ .short 11585, 0, 6270, 15137
iadst4_coeffs:
.short 5283, 15212, 9929, 13377
endconst
@@ -30,8 +30,8 @@ endconst
const iadst8_coeffs, align=4
.short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
- .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
- .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
.short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
.short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst
@@ -192,14 +192,14 @@ endconst
.endm
.macro idct4 c0, c1, c2, c3
- smull v22.4s, \c1\().4h, v0.h[2]
- smull v20.4s, \c1\().4h, v0.h[1]
+ smull v22.4s, \c1\().4h, v0.h[3]
+ smull v20.4s, \c1\().4h, v0.h[2]
add v16.4h, \c0\().4h, \c2\().4h
sub v17.4h, \c0\().4h, \c2\().4h
- smlal v22.4s, \c3\().4h, v0.h[1]
+ smlal v22.4s, \c3\().4h, v0.h[2]
smull v18.4s, v16.4h, v0.h[0]
smull v19.4s, v17.4h, v0.h[0]
- smlsl v20.4s, \c3\().4h, v0.h[2]
+ smlsl v20.4s, \c3\().4h, v0.h[3]
rshrn v22.4h, v22.4s, #14
rshrn v18.4h, v18.4s, #14
rshrn v19.4h, v19.4s, #14
@@ -326,9 +326,9 @@ itxfm_func4x4 iwht, iwht
.macro idct8
dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
- dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
- dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
- dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
+ dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
+ dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
+ dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3
butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a
@@ -361,8 +361,8 @@ itxfm_func4x4 iwht, iwht
dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
neg v19.8h, v19.8h // v19 = out[3]
- dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a
- dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a
+ dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a
+ dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a
dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
@@ -543,13 +543,13 @@ endfunc
function idct16
dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
- dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
- dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
- dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
- dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
- dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
- dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
- dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+ dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+ dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+ dmbutterfly v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+ dmbutterfly v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+ dmbutterfly v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+ dmbutterfly v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+ dmbutterfly v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
@@ -561,20 +561,20 @@ function idct16
butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
- dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
- dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
idct16_end
endfunc
function idct16_half
dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
- dmbutterfly_h1 v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
- dmbutterfly_h1 v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
- dmbutterfly_h2 v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
- dmbutterfly_h1 v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
- dmbutterfly_h2 v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
- dmbutterfly_h1 v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
- dmbutterfly_h2 v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+ dmbutterfly_h1 v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+ dmbutterfly_h1 v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+ dmbutterfly_h2 v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+ dmbutterfly_h1 v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+ dmbutterfly_h2 v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+ dmbutterfly_h1 v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+ dmbutterfly_h2 v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
@@ -586,20 +586,20 @@ function idct16_half
butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
- dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
- dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
idct16_end
endfunc
function idct16_quarter
- dsmull_h v24, v25, v19, v1.h[6]
- dsmull_h v4, v5, v17, v0.h[7]
- dsmull_h v7, v6, v18, v0.h[4]
- dsmull_h v30, v31, v18, v0.h[3]
+ dsmull_h v24, v25, v19, v1.h[7]
+ dsmull_h v4, v5, v17, v1.h[0]
+ dsmull_h v7, v6, v18, v0.h[5]
+ dsmull_h v30, v31, v18, v0.h[4]
neg v24.4s, v24.4s
neg v25.4s, v25.4s
- dsmull_h v29, v28, v17, v1.h[0]
- dsmull_h v26, v27, v19, v1.h[5]
+ dsmull_h v29, v28, v17, v1.h[1]
+ dsmull_h v26, v27, v19, v1.h[6]
dsmull_h v22, v23, v16, v0.h[0]
drshrn_h v24, v24, v25, #14
drshrn_h v16, v4, v5, #14
@@ -609,8 +609,8 @@ function idct16_quarter
drshrn_h v17, v26, v27, #14
drshrn_h v28, v22, v23, #14
- dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
- dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
+ dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
+ dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
neg v22.4s, v22.4s
neg v23.4s, v23.4s
drshrn_h v27, v20, v21, #14
@@ -646,16 +646,16 @@ function iadst16
dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
ld1 {v0.8h}, [x10]
dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
- dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8
+ dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8
dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
- dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13
+ dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13
dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
- dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10
+ dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10
butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0
dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
- dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15
+ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15
butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1
dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
@@ -663,15 +663,15 @@ function iadst16
butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2
butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3
- dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12
- dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15
+ dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12
+ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15
dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
neg v29.8h, v29.8h // v29 = out[13]
- dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a
- dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a
+ dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a
+ dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a
butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a
butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10
@@ -1101,10 +1101,10 @@ endfunc
butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a
butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29
- dmbutterfly v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
- dmbutterfly v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28
- dmbutterfly v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
- dmbutterfly v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+ dmbutterfly v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
+ dmbutterfly v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28
+ dmbutterfly v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
+ dmbutterfly v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24
butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a
@@ -1141,10 +1141,10 @@ function idct32_odd
butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
- dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
- dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
- dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
- dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
idct32_end
endfunc
@@ -1167,10 +1167,10 @@ function idct32_odd_half
butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
- dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
- dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
- dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
- dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
idct32_end
endfunc
@@ -1198,18 +1198,18 @@ function idct32_odd_quarter
drshrn_h v6, v20, v21, #14
drshrn_h v30, v24, v25, #14
- dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[3], v0.h[4]
- dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[3], v0.h[4]
+ dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
+ dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
drshrn_h v23, v16, v17, #14
drshrn_h v24, v18, v19, #14
neg v20.4s, v20.4s
neg v21.4s, v21.4s
drshrn_h v27, v27, v26, #14
drshrn_h v20, v20, v21, #14
- dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[5], v0.h[6]
+ dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
drshrn_h v21, v16, v17, #14
drshrn_h v26, v18, v19, #14
- dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[5], v0.h[6]
+ dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
drshrn_h v25, v16, v17, #14
neg v18.4s, v18.4s
neg v19.4s, v19.4s
--
2.7.4
More information about the ffmpeg-devel
mailing list