[FFmpeg-devel] [PATCH 1/2] lavc/aarch64: move transpose_4x4S and transpose_8x8S to neon.S

Fri Aug 20 00:31:01 EEST 2021

The transpose_4x4s and transpose_8x8s macros were defined in
vp9itxfm_16bpp_neon.S; however, they are not specific to VP9 and could be
used elsewhere. Move them to neon.S as transpose_4x4S and transpose_8x8S.

Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
---
 libavcodec/aarch64/neon.S                | 49 ++++++++++++++++++++++++
 libavcodec/aarch64/vp9itxfm_16bpp_neon.S | 49 ------------------------
 2 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 1ad32c359d..4186186185 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -160,3 +160,52 @@
         trn2            \r7\().2D,  \r9\().2D,  \r7\().2D
 
 .endm
+
+.macro transpose_4x4S r0, r1, r2, r3, r4, r5, r6, r7
+        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s   // Step 1 (32 bit lanes): r0,r1 -> r4,r5
+        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s   // r4-r7 are scratch and get clobbered
+        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s   //                        r2,r3 -> r6,r7
+        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
+        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d   // Step 2 (64 bit lanes): r4,r6 -> r0,r2
+        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d   // results are written back to r0-r3
+        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d   //                        r5,r7 -> r1,r3
+        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
+.endm
+
+// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
+// over two registers (r0-r15). t0-t3 are clobbered as temporaries.
+.macro transpose_8x8S r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
+        transpose_4x4S  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
+        transpose_4x4S  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
+
+        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
+        // while swapping the two 4x4 matrices between each other
+
+        // First step of the 4x4 transpose of r1-r7, into t0-t3
+        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
+        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
+        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
+
+        // First step of the 4x4 transpose of r8-r14, into r1-r7
+        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
+        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
+        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
+        trn2            \r7\().4s,  \r12\().4s, \r14\().4s
+
+        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
+        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
+        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
+        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
+        trn2            \r14\().2d, \t1\().2d,  \t3\().2d
+
+        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
+        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
+        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
+        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
+        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
+
+        // Move the outputs of trn1 back in place
+        mov             \r1\().16b,  \t0\().16b
+        mov             \r3\().16b,  \t1\().16b
+.endm
diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
index 68296d9c40..a165ab3271 100644
--- a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -41,55 +41,6 @@ const iadst16_coeffs, align=4
         .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
 endconst
 
-.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
-        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
-        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
-        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
-        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
-        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
-        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
-        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
-        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
-.endm
-
-// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
-// over two registers.
-.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
-        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
-        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
-
-        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
-        // while swapping the two 4x4 matrices between each other
-
-        // First step of the 4x4 transpose of r1-r7, into t0-t3
-        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
-        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
-        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
-        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
-
-        // First step of the 4x4 transpose of r8-r12, into r1-r7
-        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
-        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
-        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
-        trn2            \r7\().4s,  \r12\().4s, \r14\().4s
-
-        // Second step of the 4x4 transpose of r1-r7 (now in t0-r3), into r8-r12
-        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
-        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
-        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
-        trn2            \r14\().2d, \t1\().2d,  \t3\().2d
-
-        // Second step of the 4x4 transpose of r8-r12 (now in r1-r7), in place as far as possible
-        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
-        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
-        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
-        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
-
-        // Move the outputs of trn1 back in place
-        mov             \r1\().16b,  \t0\().16b
-        mov             \r3\().16b,  \t1\().16b
-.endm
-
 // out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
 // out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
 // in/out are .4s registers; this can do with 4 temp registers, but is
-- 
2.32.0