[FFmpeg-devel] [PATCH] avcodec/vp9: AVX2 ipred_dl_32x32 improvement
Ilia Valiakhmetov
zakne0ne at gmail.com
Mon Jul 3 14:12:50 EEST 2017
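Use symmetry properties of the ipred_dl function for better performance: the left half of each row in the lower 16 rows is identical to the right half of the row 16 lines above it, so both are written from the same register and the loop runs half as many iterations.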
vp9_diag_downleft_32x32_12bpp_c: 1534.2
vp9_diag_downleft_32x32_12bpp_sse2: 145.9
vp9_diag_downleft_32x32_12bpp_ssse3: 140.0
vp9_diag_downleft_32x32_12bpp_avx: 134.8
vp9_diag_downleft_32x32_12bpp_avx2: 78.9
The avx2 figure is ~40% lower than the avx one (78.9 vs. 134.8), i.e. about 1.7x faster.
Signed-off-by: Ilia Valiakhmetov <zakne0ne at gmail.com>
---
libavcodec/x86/vp9intrapred_16bpp.asm | 47 ++++++++++++++++++++++++-----------
1 file changed, 33 insertions(+), 14 deletions(-)
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index 8d8d65e..33a8a7f 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -901,49 +901,68 @@ cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
vperm2i128 m2, m1, m4, q0201 ; Z......555555555
vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
- DEFINE_ARGS dst, stride, stride3, cnt
+ vperm2i128 m6, m2, m2, q0101
+ DEFINE_ARGS dst, stride, stride3, dst16, cnt
lea stride3q, [strideq*3]
- mov cntd, 4
+ lea dst16q, [dstq+strideq*8]
+ lea dst16q, [dst16q+strideq*8]
+ mov cntd, 2
.loop:
mova [dstq+strideq*0 + 0], m0
mova [dstq+strideq*0 +32], m1
+ mova [dst16q+strideq*0+ 0], m1
+ mova [dst16q+strideq*0+32], m6
vpalignr m3, m5, m0, 2
vpalignr m4, m2, m1, 2
mova [dstq+strideq*1 + 0], m3
mova [dstq+strideq*1 +32], m4
+ mova [dst16q+strideq*1 +0], m4
+ mova [dst16q+strideq*1 +32], m6
vpalignr m3, m5, m0, 4
vpalignr m4, m2, m1, 4
mova [dstq+strideq*2 + 0], m3
mova [dstq+strideq*2 +32], m4
+ mova [dst16q+strideq*2+0], m4
+ mova [dst16q+strideq*2+32], m6
vpalignr m3, m5, m0, 6
- vpalignr m4, m2, m1, 6
+ vpalignr m4, m2, m1, 6
mova [dstq+stride3q*1+ 0], m3
mova [dstq+stride3q*1+32], m4
- lea dstq, [dstq+strideq*4]
+ mova [dst16q+stride3q*1+0], m4
+ mova [dst16q+stride3q*1+32], m6
vpalignr m3, m5, m0, 8
vpalignr m4, m2, m1, 8
+ lea dstq, [dstq+strideq*4]
+ lea dst16q, [dst16q+strideq*4]
mova [dstq+strideq*0 + 0], m3
mova [dstq+strideq*0 +32], m4
+ mova [dst16q+strideq*0 +0], m4
+ mova [dst16q+strideq*0 +32], m6
vpalignr m3, m5, m0, 10
vpalignr m4, m2, m1, 10
mova [dstq+strideq*1 + 0], m3
mova [dstq+strideq*1 +32], m4
+ mova [dst16q+strideq*1 +0], m4
+ mova [dst16q+strideq*1 +32], m6
vpalignr m3, m5, m0, 12
vpalignr m4, m2, m1, 12
- mova [dstq+strideq*2+ 0], m3
- mova [dstq+strideq*2+32], m4
+ mova [dstq+strideq*2+ 0], m3
+ mova [dstq+strideq*2+32], m4
+ mova [dst16q+strideq*2+0], m4
+ mova [dst16q+strideq*2+32], m6
vpalignr m3, m5, m0, 14
vpalignr m4, m2, m1, 14
- mova [dstq+stride3q+ 0], m3
- mova [dstq+stride3q+ 32], m4
- vpalignr m3, m5, m0, 16
- vpalignr m4, m2, m1, 16
- vperm2i128 m5, m3, m4, q0201
- vperm2i128 m2, m4, m4, q0101
- mova m0, m3
- mova m1, m4
+ mova [dstq+stride3q+ 0], m3
+ mova [dstq+stride3q+ 32], m4
+ mova [dst16q+stride3q+ 0], m4
+ mova [dst16q+stride3q+32], m6
+ mova m0, m5
+ mova m1, m2
+ vperm2i128 m5, m5, m2, q0201
+ mova m2, m6
lea dstq, [dstq+strideq*4]
+ lea dst16q, [dst16q+strideq*4]
dec cntd
jg .loop
RET
--
2.8.3
More information about the ffmpeg-devel
mailing list