[FFmpeg-devel] [PATCH 1/2] lavc/aarch64: move transpose_4x4S and transpose_8x8S to neon.S
Mikhail Nitenko
mnitenko at gmail.com
Fri Aug 20 00:31:01 EEST 2021
The transpose_4x4s and transpose_8x8s macros were defined in vp9itxfm_16bpp_neon.S; however, they are
not specific to VP9 and could be used elsewhere. Move them to neon.S as transpose_4x4S and transpose_8x8S.
Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
---
libavcodec/aarch64/neon.S | 49 ++++++++++++++++++++++++
libavcodec/aarch64/vp9itxfm_16bpp_neon.S | 49 ------------------------
2 files changed, 49 insertions(+), 49 deletions(-)
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 1ad32c359d..4186186185 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -160,3 +160,52 @@
trn2 \r7\().2D, \r9\().2D, \r7\().2D
.endm
+
+.macro transpose_4x4S r0, r1, r2, r3, r4, r5, r6, r7
+ trn1 \r4\().4s, \r0\().4s, \r1\().4s // r4 = a0 b0 a2 b2 (a-d = input rows in r0-r3; r4-r7 are temporaries)
+ trn2 \r5\().4s, \r0\().4s, \r1\().4s // r5 = a1 b1 a3 b3
+ trn1 \r6\().4s, \r2\().4s, \r3\().4s // r6 = c0 d0 c2 d2
+ trn2 \r7\().4s, \r2\().4s, \r3\().4s // r7 = c1 d1 c3 d3
+ trn1 \r0\().2d, \r4\().2d, \r6\().2d // r0 = a0 b0 c0 d0 (column 0)
+ trn2 \r2\().2d, \r4\().2d, \r6\().2d // r2 = a2 b2 c2 d2 (column 2)
+ trn1 \r1\().2d, \r5\().2d, \r7\().2d // r1 = a1 b1 c1 d1 (column 1)
+ trn2 \r3\().2d, \r5\().2d, \r7\().2d // r3 = a3 b3 c3 d3 (column 3)
+.endm
+
+// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
+// over two registers. t0-t3 are clobbered as temporaries.
+.macro transpose_8x8S r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
+ transpose_4x4S \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3 // r0,r2,r4,r6 are transposed in place
+ transpose_4x4S \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3 // r9,r11,r13,r15 are transposed in place
+
+ // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
+ // while swapping the two 4x4 matrices between each other
+
+ // First step of the 4x4 transpose of r1-r7, into t0-t3
+ trn1 \t0\().4s, \r1\().4s, \r3\().4s
+ trn2 \t1\().4s, \r1\().4s, \r3\().4s
+ trn1 \t2\().4s, \r5\().4s, \r7\().4s
+ trn2 \t3\().4s, \r5\().4s, \r7\().4s
+
+ // First step of the 4x4 transpose of r8-r14, into r1-r7
+ trn1 \r1\().4s, \r8\().4s, \r10\().4s
+ trn2 \r3\().4s, \r8\().4s, \r10\().4s
+ trn1 \r5\().4s, \r12\().4s, \r14\().4s
+ trn2 \r7\().4s, \r12\().4s, \r14\().4s
+
+ // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
+ trn1 \r8\().2d, \t0\().2d, \t2\().2d
+ trn2 \r12\().2d, \t0\().2d, \t2\().2d
+ trn1 \r10\().2d, \t1\().2d, \t3\().2d
+ trn2 \r14\().2d, \t1\().2d, \t3\().2d
+
+ // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
+ trn1 \t0\().2d, \r1\().2d, \r5\().2d
+ trn2 \r5\().2d, \r1\().2d, \r5\().2d
+ trn1 \t1\().2d, \r3\().2d, \r7\().2d
+ trn2 \r7\().2d, \r3\().2d, \r7\().2d
+
+ // Move the outputs of trn1 back in place (r1/r3 were still needed as trn2 inputs above)
+ mov \r1\().16b, \t0\().16b
+ mov \r3\().16b, \t1\().16b
+.endm
\ No newline at end of file
diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
index 68296d9c40..a165ab3271 100644
--- a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -41,55 +41,6 @@ const iadst16_coeffs, align=4
.short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst
-.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
- trn1 \r4\().4s, \r0\().4s, \r1\().4s
- trn2 \r5\().4s, \r0\().4s, \r1\().4s
- trn1 \r6\().4s, \r2\().4s, \r3\().4s
- trn2 \r7\().4s, \r2\().4s, \r3\().4s
- trn1 \r0\().2d, \r4\().2d, \r6\().2d
- trn2 \r2\().2d, \r4\().2d, \r6\().2d
- trn1 \r1\().2d, \r5\().2d, \r7\().2d
- trn2 \r3\().2d, \r5\().2d, \r7\().2d
-.endm
-
-// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
-// over two registers.
-.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
- transpose_4x4s \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
- transpose_4x4s \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3
-
- // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
- // while swapping the two 4x4 matrices between each other
-
- // First step of the 4x4 transpose of r1-r7, into t0-t3
- trn1 \t0\().4s, \r1\().4s, \r3\().4s
- trn2 \t1\().4s, \r1\().4s, \r3\().4s
- trn1 \t2\().4s, \r5\().4s, \r7\().4s
- trn2 \t3\().4s, \r5\().4s, \r7\().4s
-
- // First step of the 4x4 transpose of r8-r12, into r1-r7
- trn1 \r1\().4s, \r8\().4s, \r10\().4s
- trn2 \r3\().4s, \r8\().4s, \r10\().4s
- trn1 \r5\().4s, \r12\().4s, \r14\().4s
- trn2 \r7\().4s, \r12\().4s, \r14\().4s
-
- // Second step of the 4x4 transpose of r1-r7 (now in t0-r3), into r8-r12
- trn1 \r8\().2d, \t0\().2d, \t2\().2d
- trn2 \r12\().2d, \t0\().2d, \t2\().2d
- trn1 \r10\().2d, \t1\().2d, \t3\().2d
- trn2 \r14\().2d, \t1\().2d, \t3\().2d
-
- // Second step of the 4x4 transpose of r8-r12 (now in r1-r7), in place as far as possible
- trn1 \t0\().2d, \r1\().2d, \r5\().2d
- trn2 \r5\().2d, \r1\().2d, \r5\().2d
- trn1 \t1\().2d, \r3\().2d, \r7\().2d
- trn2 \r7\().2d, \r3\().2d, \r7\().2d
-
- // Move the outputs of trn1 back in place
- mov \r1\().16b, \t0\().16b
- mov \r3\().16b, \t1\().16b
-.endm
-
// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
// in/out are .4s registers; this can do with 4 temp registers, but is
--
2.32.0
More information about the ffmpeg-devel
mailing list