[FFmpeg-devel] [patch][OpenHEVC]added ASM functions for epel + qpel
Ronald S. Bultje
rsbultje at gmail.com
Fri Mar 7 14:21:06 CET 2014
Hi,
> +%macro EPEL_FILTER 2 ; bit depth, filter index
> +%ifdef PIC
> + lea rfilterq, [hevc_epel_filters_sse4_%1]
> +%else
> + %define rfilterq hevc_epel_filters_sse4_%1
> +%endif
> + sub %2q, 1
> + shl %2q, 5 ; multiply by 32
> + movdqa m14, [rfilterq + %2q] ; get 2 first values of filters
> + movdqa m15, [rfilterq + %2q+16] ; get 2 last values of filters
> +%endmacro
Fold the -1 into the address computation instead of using a separate sub, e.g.:
lea %2q, [%2q*8-8]
movdqa m14, [rfilterq+%2q*4]
movdqa m15, [rfilterq+%2q*4+16]
or:
shl %2q, 5
movdqa m14, [rfilterq+%2q-32]
movdqa m15, [rfilterq+%2q-16]
i.e. remove the sub.
> +%macro EPEL_HV_FILTER 1
> +%ifdef PIC
> + lea rfilterq, [hevc_epel_filters_sse4_%1]
> +%else
> + %define rfilterq hevc_epel_filters_sse4_%1
> +%endif
> + sub mxq, 1
> + sub myq, 1
> + shl mxq, 5 ; multiply by 32
> + shl myq, 5 ; multiply by 32
> + movdqa m14, [rfilterq + mxq] ; get 2 first values of filters
> + movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
[..]
> +%ifdef PIC
> + lea rfilterq, [hevc_epel_filters_sse4_10]
> +%else
> + %define rfilterq hevc_epel_filters_sse4_10
> +%endif
> + movdqa m12, [rfilterq + myq] ; get 2 first values of filters
> + movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
> +%endmacro
Same here: remove the subs and fold the -1 into the load offsets, as above.
> + lea r3srcq, [srcstrideq*3]
(That's a mildly weird register name, "r3src", I'd have called it
srcstride3 or so.)
> +%macro QPEL_FILTER 2
> +%ifdef PIC
> + lea rfilterq, [hevc_qpel_filters_sse4_%1]
> +%else
> + %define rfilterq hevc_qpel_filters_sse4_%1
> +%endif
> + sub %2q, 1
> + shl %2q, 6 ; multiply by 16
> + movdqa m12, [rfilterq + %2q] ; get 4 first values of filters
> + movdqa m13, [rfilterq + %2q + 16] ; get 4 first values of filters
> + movdqa m14, [rfilterq + %2q + 32] ; get 4 first values of filters
> + movdqa m15, [rfilterq + %2q + 48] ; get 4 first values of filters
> +%endmacro
Remove the sub here as well (with a shift of 6, the folded offset is -64).
> +%macro EPEL_LOAD 4
> +%ifdef PIC
> + lea rfilterq, [%2]
> +%else
> + %define rfilterq %2
> +%endif
> + movdqu m0, [rfilterq ] ;load 128bit of x
Wait, what? Why are you lea'ing here? This (%2) is an address, not a label,
so the lea does nothing.
> +%ifnum %3
> + movdqu m1, [rfilterq+ %3] ;load 128bit of x+stride
> + movdqu m2, [rfilterq+2*%3] ;load 128bit of x+2*stride
> + movdqu m3, [rfilterq+3*%3] ;load 128bit of x+3*stride
> +%else
> + movdqu m1, [rfilterq+ %3q] ;load 128bit of x+stride
> + movdqu m2, [rfilterq+2*%3q] ;load 128bit of x+2*stride
> + movdqu m3, [rfilterq+r3srcq] ;load 128bit of x+2*stride
> +%endif
I think I mentioned before that if %4 <= 4, you should use movd, and if %4
== 8, you should use movq. Check Agner Fog's instruction tables for cycle
counts, but it's faster.
> +%macro QPEL_H_LOAD 3
> +%assign %%stride (%1+7)/8
> +
> + movdqu m0, [%2-3*%%stride] ; load data from source
> + movdqu m1, [%2-2*%%stride]
> + movdqu m2, [%2-%%stride]
> + movdqu m3, [%2 ]
> + movdqu m4, [%2+%%stride]
> + movdqu m5, [%2+2*%%stride]
> + movdqu m6, [%2+3*%%stride]
> + movdqu m7, [%2+4*%%stride]
> +
Same here: if %1*%%stride <= 4, use movd; if it is 8, use movq.
> +%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
> +%if %2 == 8
> +%if %1 > 8
> + movhlps m1, m0
> + pmovzxbw m1, m1
> + psllw m1, 14-%2
> +%endif
> + pmovzxbw m0, m0
> +%endif
> + psllw m0, 14-%2
> +%endmacro
Ah, I see your sse4 instructions. Is this faster than a simple pxor at the
start of the loop (_only in the functions that use this macro_ - so only
fullpel) and punpcklbw/punpckhbw here (or SBUTTERFLY)? I'd say that this
being ssse3 would be a major advantage.
> +%macro HEVC_PUT_HEVC_PEL_PIXELS 2
> +cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
> +.loop
> +%if %2 == 8
> +%if %1 == 4
> + movd m0, [srcq] ; load data from source
Shouldn't this be %1 <= 4? Otherwise %1 == 2 will use a full movdqu.
> +cglobal hevc_put_hevc_pel_pixels24_8, 5, 5, 2, dst, dststride, src, srcstride,height
> +.loop
> + movdqu m0, [srcq] ; load data from source
> + movhlps m1, m0
> + pmovzxbw m0, m0
> + pmovzxbw m1, m1
> + psllw m0, 6
> + psllw m1, 6
> + movdqa [dstq], m0
> + movdqa [dstq + 16], m1 ; store 16
> + movq m0, [srcq + 16]
> + pmovzxbw m0, m0
> + psllw m0, 6
> + movdqa [dstq + 32], m0
> + LOOP_END dst, dststride, src, srcstride
> + RET
Don't forget to change this to use ssse3 also (punpcklbw/punpckhbw with a
pre-loop initialized pxor, instead of movhlps+pmovzxbw).
More to come.
Ronald
More information about the ffmpeg-devel
mailing list