[FFmpeg-devel] [PATCH 4/7] x86: sbrdsp: implement SSE hf_apply_noise
Michael Niedermayer
michaelni at gmx.at
Sat Apr 6 16:56:24 CEST 2013
On Sat, Apr 06, 2013 at 10:52:11AM +0000, Christophe Gisquet wrote:
> 233 to 115(sse)/110(sse2) cycles on Arrandale and Win64.
> Replacing the multiplication by s_m[m] by an andps and an xorps with
> appropriate vectors is slower. Unrolling is a 15 cycles win.
> ---
> libavcodec/x86/sbrdsp.asm | 145 +++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/sbrdsp_init.c | 32 ++++++++++
> 2 files changed, 177 insertions(+)
>
> diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> index 65c972e..a7998fa 100644
> --- a/libavcodec/x86/sbrdsp.asm
> +++ b/libavcodec/x86/sbrdsp.asm
> @@ -26,6 +26,12 @@ SECTION_RODATA
> ps_mask times 2 dd 1<<31, 0
> ps_mask2 times 2 dd 0, 1<<31
> ps_neg times 4 dd 1<<31
> +ps_noise0 times 2 dd 1.0, 0.0,
> +ps_noise2 times 2 dd -1.0, 0.0
> +ps_noise13 dd 0.0, 1.0, 0.0, -1.0
> + dd 0.0, -1.0, 0.0, 1.0
> + dd 0.0, 1.0, 0.0, -1.0
> +cextern sbr_noise_table
>
> SECTION_TEXT
>
> @@ -358,3 +364,142 @@ SBR_QMF_DEINT_BFLY
>
> INIT_XMM sse2
> SBR_QMF_DEINT_BFLY
> +
> +%if WIN64
> +%define NREGS 0
> +%else
> +%ifndef PIC
> +%define NREGS 1
> +%else
> +%define NREGS 0
> +%endif
> +%endif
> +
> +%macro SBUTTERFLY_F128 3
> + vperm2i128 m%3, m%1, m%2, q0301 ; punpckh
> + vinserti128 m%1, m%1, xm%2, 1 ; punpckl
> + SWAP %2, %3
> +%endmacro
> +
> +%macro SBR_HF_APPLY_NOISE 0
> +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
> +; const float *q_filt, int noise,
> +; int kx, int m_max)
> +cglobal sbr_hf_apply_noise_0, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> + mova m0, [ps_noise0]
> + jmp apply_noise_main %+ SUFFIX
> +
> +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
> +; const float *q_filt, int noise,
> +; int kx, int m_max)
> +cglobal sbr_hf_apply_noise_1, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> + and kxq, 1
> + shl kxq, 4
> +%if NREGS
> + lea r5q, [ps_noise13]
> + mova m0, [kxq + r5q]
> +%else
> + mova m0, [kxq + ps_noise13]
> +%endif
This could probably be simplified with some kind of generic macro,
like
FOO mova, r5q, m0, ps_noise13, kxq
or something similar ...
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
During times of universal deceit, telling the truth becomes a
revolutionary act. -- George Orwell
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130406/7d826d50/attachment.asc>
More information about the ffmpeg-devel
mailing list