[FFmpeg-devel] [PATCH 4/6] x86: hevc_mc: improve EPEL_LOAD for 1D cases
Christophe Gisquet
christophe.gisquet at gmail.com
Sun Jun 1 16:13:00 CEST 2014
2 regs can be saved by:
- not using lea on an address that is already an effective address
- immediately incrementing the src pointer, instead of storing an offset
3 lines away, and removing the increment from the end of the loop
---
libavcodec/x86/hevc_mc.asm | 63 ++++++++++++++++++++++++----------------------
1 file changed, 33 insertions(+), 30 deletions(-)
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index b28dea3..6db3a2a 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -161,8 +161,10 @@ QPEL_TABLE 10, 4, w, sse4
movdqa m15, [rfilterq + %2q*8 + 48] ; get 4 first values of filters
%endmacro
-%macro EPEL_LOAD 4
-%ifdef PIC
+%macro EPEL_LOAD 4-5
+%if %0 == 5
+ %define rfilterq %2
+%elifdef PIC
lea rfilterq, [%2]
%else
%define rfilterq %2
@@ -175,8 +177,13 @@ QPEL_TABLE 10, 4, w, sse4
%else
movdqu m1, [rfilterq+ %3q] ;load 128bit of x+stride
movdqu m2, [rfilterq+2*%3q] ;load 128bit of x+2*stride
+%if %0 == 5
+ add rfilterq, %3q
+ movdqu m3, [rfilterq+2*%3q] ;load 128bit of x+3*stride
+%else
movdqu m3, [rfilterq+r3srcq] ;load 128bit of x+2*stride
%endif
+%endif
%if %1 == 8
%if %4 > 8
@@ -553,21 +560,19 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstrid
cglobal hevc_put_hevc_epel_h%1_%2, 6, 6, 6, dst, dststride, src, srcstride, height, mx
%assign %%stride ((%2 + 7)/8)
EPEL_FILTER %2, mx
-%define rfilterq mxq
.loop
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 1
EPEL_COMPUTE %2, %1, m4, m5
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, dststride, src, srcstride
RET
-cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, mx
%assign %%stride ((%2 + 7)/8)
movdqa m6, [pw_%2]
EPEL_FILTER %2, mx
-%define rfilterq mxq
.loop
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 1
EPEL_COMPUTE %2, %1, m4, m5
UNI_COMPUTE %1, %2, m0, m1, m6
PEL_%2STORE%1 dstq, m0, m1
@@ -577,12 +582,13 @@ cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride,
jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 8, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx
+ mov heightd, mxm
movdqa m6, [pw_bi_%2]
- EPEL_FILTER %2, mx
-%define rfilterq mxq
+ EPEL_FILTER %2, height
+ mov heightd, heightm
.loop
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 1
EPEL_COMPUTE %2, %1, m4, m5
SIMPLE_BILOAD %1, src2q, m2, m3
BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
@@ -601,51 +607,48 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 8, 7, dst, dststride, src, srcstride, s
; int16_t* mcbuffer)
; ******************************
-cglobal hevc_put_hevc_epel_v%1_%2, 5, 7, 6, dst, dststride, src, srcstride, height, r3src, my
- mov myd, mym
- EPEL_FILTER %2, my
- lea r3srcq, [srcstrideq*3]
+cglobal hevc_put_hevc_epel_v%1_%2, 5, 6, 6, dst, dststride, src, srcstride, height, r3src, my
+ mov r5d, mym
+ EPEL_FILTER %2, r5
sub srcq, srcstrideq
-%define rfilterq myq
.loop
- EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_LOAD %2, srcq, srcstride, %1, 1
EPEL_COMPUTE %2, %1, m4, m5
PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, dststride, src, srcstride
+ lea dstq, [dstq+2*dststrideq]
+ dec heightd ; cmp height
+ jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, 7, dst, dststride, src, srcstride, height, r3src, my
- mov myd, mym
- EPEL_FILTER %2, my
- lea r3srcq, [srcstrideq*3]
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 6, 7, dst, dststride, src, srcstride, height, r3src, my
+ mov r5d, mym
+ EPEL_FILTER %2, r5
movdqa m6, [pw_%2]
sub srcq, srcstrideq
.loop
- EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_LOAD %2, srcq, srcstride, %1, 1
EPEL_COMPUTE %2, %1, m4, m5
UNI_COMPUTE %1, %2, m0, m1, m6
PEL_%2STORE%1 dstq, m0, m1
lea dstq, [dstq+dststrideq] ; dst += dststride
- lea srcq, [srcq+srcstrideq] ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my
- mov myd, mym
- EPEL_FILTER %2, my
- lea r3srcq, [srcstrideq*3]
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 7, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my
+ mov heightd, mym
+ EPEL_FILTER %2, height
movdqa m6, [pw_bi_%2]
+ mov heightd, heightm
sub srcq, srcstrideq
.loop
- EPEL_LOAD %2, srcq, srcstride, %1
+ EPEL_LOAD %2, srcq, srcstride, %1, 1
EPEL_COMPUTE %2, %1, m4, m5
SIMPLE_BILOAD %1, src2q, m2, m3
BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
PEL_%2STORE%1 dstq, m0, m1
lea dstq, [dstq+dststrideq] ; dst += dststride
- lea srcq, [srcq+srcstrideq] ; src += srcstride
lea src2q, [src2q+2*src2strideq] ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
--
1.8.0.msysgit.0
More information about the ffmpeg-devel
mailing list