[FFmpeg-cvslog] avcodec/x86/vp9lpf: merge a few movs with other instructions.
Clément Bœsch
git at videolan.org
Sun Apr 20 21:29:35 CEST 2014
ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Sun Apr 20 21:19:30 2014 +0200| [f0d368d75819d552cbb5cd8ed6e51efb50d771da] | committer: Clément Bœsch
avcodec/x86/vp9lpf: merge a few movs with other instructions.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f0d368d75819d552cbb5cd8ed6e51efb50d771da
---
libavcodec/x86/vp9lpf.asm | 79 +++++++++++++++++++++------------------------
1 file changed, 37 insertions(+), 42 deletions(-)
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 5f4e67c..1e9c7bb 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -88,9 +88,8 @@ SECTION .text
punpck%2bw %3, %6, m0
paddw %1, %3
punpck%2bw %3, %7, m0
- paddw %1, %3
- mova %3, %1
- psraw %1, %8
+ paddw %3, %1
+ psraw %1, %3, %8
%endmacro
%macro FILTER_INIT 8 ; tmp1, tmp2, cacheL, cacheH, dstp, filterid, mask, source
@@ -154,49 +153,45 @@ SECTION .text
%endmacro
%macro FILTER6_INIT 3 ; %1=dst %2=h/l %3=cache
- punpck%2bw %3, m14, m0 ; p3: B->W
- mova %1, %3 ; p3
- paddw %1, %3 ; p3*2
- paddw %1, %3 ; p3*3
- punpck%2bw %3, m15, m0 ; p2: B->W
- paddw %1, %3 ; p3*3 + p2
- paddw %1, %3 ; p3*3 + p2*2
- punpck%2bw %3, m10, m0 ; p1: B->W
- paddw %1, %3 ; p3*3 + p2*2 + p1
- punpck%2bw %3, m11, m0 ; p0: B->W
- paddw %1, %3 ; p3*3 + p2*2 + p1 + p0
- punpck%2bw %3, m12, m0 ; q0: B->W
- paddw %1, %3 ; p3*3 + p2*2 + p1 + p0 + q0
- paddw %1, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4
- mova %3, %1 ; base for next line (cache)
- psraw %1, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
+ punpck%2bw %1, m14, m0 ; p3: B->W (dst doubles as the unpack scratch; the sum is built in cache)
+ paddw %3, %1, %1 ; cache = p3*2 (3-operand add, takes the place of the former mova + paddw)
+ paddw %3, %1 ; cache = p3*3
+ punpck%2bw %1, m15, m0 ; p2: B->W
+ paddw %3, %1 ; cache = p3*3 + p2
+ paddw %3, %1 ; cache = p3*3 + p2*2
+ punpck%2bw %1, m10, m0 ; p1: B->W
+ paddw %3, %1 ; cache = p3*3 + p2*2 + p1
+ punpck%2bw %1, m11, m0 ; p0: B->W
+ paddw %3, %1 ; cache = p3*3 + p2*2 + p1 + p0
+ punpck%2bw %1, m12, m0 ; q0: B->W
+ paddw %3, %1 ; cache = p3*3 + p2*2 + p1 + p0 + q0
+ paddw %3, [pw_4] ; cache = p3*3 + p2*2 + p1 + p0 + q0 + 4 (the +4 is half of the 8 divided out by the >> 3 below)
+ psraw %1, %3, 3 ; dst = (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3; cache keeps the unshifted sum as the base for the next line
%endmacro
%macro FILTER14_INIT 3 ; %1=dst %2=h/l %3=cache
punpck%2bw %1, m2, m0 ; p7: B->W
- mova %3, %1
- psllw %1, 3 ; p7*8
- psubw %1, %3 ; p7*7
- punpck%2bw %3, m3, m0 ; p6: B->W
- paddw %1, %3 ; p7*7 + p6
- paddw %1, %3 ; p7*7 + p6*2
- punpck%2bw %3, m8, m0 ; p5: B->W
- paddw %1, %3 ; p7*7 + p6*2 + p5
- punpck%2bw %3, m9, m0 ; p4: B->W
- paddw %1, %3 ; p7*7 + p6*2 + p5 + p4
- punpck%2bw %3, m14, m0 ; p3: B->W
- paddw %1, %3 ; p7*7 + p6*2 + p5 + p4 + p3
- punpck%2bw %3, m15, m0 ; p2: B->W
- paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p2
- punpck%2bw %3, m10, m0 ; p1: B->W
- paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p1
- punpck%2bw %3, m11, m0 ; p0: B->W
- paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p0
- punpck%2bw %3, m12, m0 ; q0: B->W
- paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p0 + q0
- paddw %1, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
- mova %3, %1 ; base for next line (cache)
- psraw %1, 4 ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
+ psllw %3, %1, 3 ; cache = p7*8 (3-operand shift, takes the place of the former mova + psllw; dst still holds p7)
+ psubw %3, %1 ; cache = p7*8 - p7 = p7*7
+ punpck%2bw %1, m3, m0 ; p6: B->W (dst doubles as the unpack scratch; the sum is built in cache)
+ paddw %3, %1 ; cache = p7*7 + p6
+ paddw %3, %1 ; cache = p7*7 + p6*2
+ punpck%2bw %1, m8, m0 ; p5: B->W
+ paddw %3, %1 ; cache = p7*7 + p6*2 + p5
+ punpck%2bw %1, m9, m0 ; p4: B->W
+ paddw %3, %1 ; cache = p7*7 + p6*2 + p5 + p4
+ punpck%2bw %1, m14, m0 ; p3: B->W
+ paddw %3, %1 ; cache = p7*7 + p6*2 + p5 + p4 + p3
+ punpck%2bw %1, m15, m0 ; p2: B->W
+ paddw %3, %1 ; cache = p7*7 + p6*2 + p5 + .. + p2
+ punpck%2bw %1, m10, m0 ; p1: B->W
+ paddw %3, %1 ; cache = p7*7 + p6*2 + p5 + .. + p1
+ punpck%2bw %1, m11, m0 ; p0: B->W
+ paddw %3, %1 ; cache = p7*7 + p6*2 + p5 + .. + p0
+ punpck%2bw %1, m12, m0 ; q0: B->W
+ paddw %3, %1 ; cache = p7*7 + p6*2 + p5 + .. + p0 + q0
+ paddw %3, [pw_8] ; cache = p7*7 + p6*2 + p5 + .. + p0 + q0 + 8 (the +8 is half of the 16 divided out by the >> 4 below)
+ psraw %1, %3, 4 ; dst = (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4; cache keeps the unshifted sum as the base for the next line
%endmacro
%macro TRANSPOSE16x16B 17
More information about the ffmpeg-cvslog
mailing list