[FFmpeg-devel] [PATCH 09/14] aarch64: vp9itxfm16: Restructure the idct32 store macros

Fri Mar 17 00:10:14 EET 2017

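This avoids concatenating macro arguments into register names
(e.g. "v\a\().4s"), which can't be used if the whole macro is
wrapped within another macro. The macros now take complete register
operands (e.g. "v16.4s") as arguments instead; where a macro also
needs the .16b form of a register, it is passed as a separate
argument.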
---
 libavcodec/aarch64/vp9itxfm_16bpp_neon.S | 90 ++++++++++++++++----------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
index 86ea29e..a97c1b6 100644
--- a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -1244,27 +1244,27 @@ function idct32_1d_4x32_pass1_neon
 .macro store_rev a, b, c, d
         // There's no rev128 instruction, but we reverse each 64 bit
         // half, and then flip them using an ext with 8 bytes offset.
-        rev64           v7.4s, v\d\().4s
-        st1             {v\a\().4s},  [x0], #16
+        rev64           v7.4s, \d
+        st1             {\a},  [x0], #16
         ext             v7.16b, v7.16b, v7.16b, #8
-        st1             {v\b\().4s},  [x0], #16
-        rev64           v6.4s, v\c\().4s
-        st1             {v\c\().4s},  [x0], #16
+        st1             {\b},  [x0], #16
+        rev64           v6.4s, \c
+        st1             {\c},  [x0], #16
         ext             v6.16b, v6.16b, v6.16b, #8
-        st1             {v\d\().4s},  [x0], #16
-        rev64           v5.4s, v\b\().4s
+        st1             {\d},  [x0], #16
+        rev64           v5.4s, \b
         st1             {v7.4s},  [x0], #16
         ext             v5.16b, v5.16b, v5.16b, #8
         st1             {v6.4s},  [x0], #16
-        rev64           v4.4s, v\a\().4s
+        rev64           v4.4s, \a
         st1             {v5.4s},  [x0], #16
         ext             v4.16b, v4.16b, v4.16b, #8
         st1             {v4.4s},  [x0], #16
 .endm
-        store_rev       16, 20, 24, 28
-        store_rev       17, 21, 25, 29
-        store_rev       18, 22, 26, 30
-        store_rev       19, 23, 27, 31
+        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
+        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
+        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
+        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
         sub             x0,  x0,  #512
 .purgem store_rev
 
@@ -1290,27 +1290,27 @@ function idct32_1d_4x32_pass1_neon
         // Store the registers a, b, c, d horizontally,
         // adding into the output first, and the mirrored,
         // subtracted from the output.
-.macro store_rev a, b, c, d
+.macro store_rev a, b, c, d, a16b, b16b
         ld1             {v4.4s},  [x0]
-        rev64           v9.4s, v\d\().4s
-        add             v4.4s, v4.4s, v\a\().4s
+        rev64           v9.4s, \d
+        add             v4.4s, v4.4s, \a
         st1             {v4.4s},  [x0], #16
-        rev64           v8.4s, v\c\().4s
+        rev64           v8.4s, \c
         ld1             {v4.4s},  [x0]
         ext             v9.16b, v9.16b, v9.16b, #8
-        add             v4.4s, v4.4s, v\b\().4s
+        add             v4.4s, v4.4s, \b
         st1             {v4.4s},  [x0], #16
         ext             v8.16b, v8.16b, v8.16b, #8
         ld1             {v4.4s},  [x0]
-        rev64           v\b\().4s, v\b\().4s
-        add             v4.4s, v4.4s, v\c\().4s
+        rev64           \b, \b
+        add             v4.4s, v4.4s, \c
         st1             {v4.4s},  [x0], #16
-        rev64           v\a\().4s, v\a\().4s
+        rev64           \a, \a
         ld1             {v4.4s},  [x0]
-        ext             v\b\().16b, v\b\().16b, v\b\().16b, #8
-        add             v4.4s, v4.4s, v\d\().4s
+        ext             \b16b, \b16b, \b16b, #8
+        add             v4.4s, v4.4s, \d
         st1             {v4.4s},  [x0], #16
-        ext             v\a\().16b, v\a\().16b, v\a\().16b, #8
+        ext             \a16b, \a16b, \a16b, #8
         ld1             {v4.4s},  [x0]
         sub             v4.4s, v4.4s, v9.4s
         st1             {v4.4s},  [x0], #16
@@ -1318,17 +1318,17 @@ function idct32_1d_4x32_pass1_neon
         sub             v4.4s, v4.4s, v8.4s
         st1             {v4.4s},  [x0], #16
         ld1             {v4.4s},  [x0]
-        sub             v4.4s, v4.4s, v\b\().4s
+        sub             v4.4s, v4.4s, \b
         st1             {v4.4s},  [x0], #16
         ld1             {v4.4s},  [x0]
-        sub             v4.4s, v4.4s, v\a\().4s
+        sub             v4.4s, v4.4s, \a
         st1             {v4.4s},  [x0], #16
 .endm
 
-        store_rev       31, 27, 23, 19
-        store_rev       30, 26, 22, 18
-        store_rev       29, 25, 21, 17
-        store_rev       28, 24, 20, 16
+        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
+        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
+        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
+        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
 .purgem store_rev
         ret
 endfunc
@@ -1370,21 +1370,21 @@ function idct32_1d_4x32_pass2_neon
 .if \neg == 0
         ld1             {v4.4s},  [x2], x9
         ld1             {v5.4s},  [x2], x9
-        add             v4.4s, v4.4s, v\a\().4s
+        add             v4.4s, v4.4s, \a
         ld1             {v6.4s},  [x2], x9
-        add             v5.4s, v5.4s, v\b\().4s
+        add             v5.4s, v5.4s, \b
         ld1             {v7.4s},  [x2], x9
-        add             v6.4s, v6.4s, v\c\().4s
-        add             v7.4s, v7.4s, v\d\().4s
+        add             v6.4s, v6.4s, \c
+        add             v7.4s, v7.4s, \d
 .else
         ld1             {v4.4s},  [x2], x7
         ld1             {v5.4s},  [x2], x7
-        sub             v4.4s, v4.4s, v\a\().4s
+        sub             v4.4s, v4.4s, \a
         ld1             {v6.4s},  [x2], x7
-        sub             v5.4s, v5.4s, v\b\().4s
+        sub             v5.4s, v5.4s, \b
         ld1             {v7.4s},  [x2], x7
-        sub             v6.4s, v6.4s, v\c\().4s
-        sub             v7.4s, v7.4s, v\d\().4s
+        sub             v6.4s, v6.4s, \c
+        sub             v7.4s, v7.4s, \d
 .endif
         ld1             {v8.4h},   [x0], x1
         ld1             {v8.d}[1], [x0], x1
@@ -1410,15 +1410,15 @@ function idct32_1d_4x32_pass2_neon
         st1             {v5.4h},   [x0], x1
         st1             {v5.d}[1], [x0], x1
 .endm
-        load_acc_store  31, 30, 29, 28
-        load_acc_store  27, 26, 25, 24
-        load_acc_store  23, 22, 21, 20
-        load_acc_store  19, 18, 17, 16
+        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
+        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
+        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
+        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
         sub             x2,  x2,  x9
-        load_acc_store  16, 17, 18, 19, 1
-        load_acc_store  20, 21, 22, 23, 1
-        load_acc_store  24, 25, 26, 27, 1
-        load_acc_store  28, 29, 30, 31, 1
+        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
+        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
+        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
+        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
 .purgem load_acc_store
         ret
 endfunc
-- 
2.7.4