[FFmpeg-devel] [PATCH 3/3] x86: sbrdsp: implement SSE2 hf_apply_noise
Michael Niedermayer
michaelni at gmx.at
Sat Apr 13 00:53:20 CEST 2013
On Fri, Apr 12, 2013 at 07:14:58PM +0200, Christophe Gisquet wrote:
> Hi,
>
> 2013/4/12 Michael Niedermayer <michaelni at gmx.at>:
> > Applying this, or 2/3 and this, I get
> > libavcodec/x86/sbrdsp.asm:357: error: (PROLOGUE:2) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:364: error: (PROLOGUE:2) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:367: error: (LOAD_NST:1) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:373: error: (PROLOGUE:2) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:380: error: (PROLOGUE:2) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:383: error: (LOAD_NST:1) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:394: error: cannot reference symbol `NREGS' in preprocessor
>
> Indeed, bad conflict resolution after rebasing I guess.
>
> Here's a fixed version, for which fate-aac runs fine on win32 and win64.
>
> --
> Christophe
> aacsbrdata.h | 6 ++-
> x86/sbrdsp.asm | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> x86/sbrdsp_init.c | 16 ++++++++
> 3 files changed, 129 insertions(+), 1 deletion(-)
> e69ce59238c11ae266ec0794bdca182af62d0b73 0002-x86-sbrdsp-implement-SSE2-hf_apply_noise.patch
> From bf49837b8cfef2652c6dfe46031b8685ed9af217 Mon Sep 17 00:00:00 2001
> From: Christophe Gisquet <christophe.gisquet at gmail.com>
> Date: Wed, 10 Apr 2013 00:42:38 +0200
> Subject: [PATCH 2/2] x86: sbrdsp: implement SSE2 hf_apply_noise
>
> 233 to 107 cycles on Arrandale and Win64.
> Replacing the multiplication by s_m[m] with a pand and a pxor using
> appropriate vectors is slower. Unrolling is a 15-cycle win.
> An SSE version was 4 cycles slower.
> ---
> libavcodec/aacsbrdata.h | 6 ++-
> libavcodec/x86/sbrdsp.asm | 108 +++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/sbrdsp_init.c | 16 +++++++
> 3 files changed, 129 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/aacsbrdata.h b/libavcodec/aacsbrdata.h
> index dd7a827..12575ee 100644
> --- a/libavcodec/aacsbrdata.h
> +++ b/libavcodec/aacsbrdata.h
> @@ -352,7 +352,7 @@ static DECLARE_ALIGNED(32, float, sbr_qmf_window_us)[640] = {
> 0.8537385600,
> };
>
> -/* First two entries repeated at end to simplify SIMD implementations. */
> +/* First eight entries repeated at end to simplify SIMD implementations. */
> const DECLARE_ALIGNED(16, float, ff_sbr_noise_table)[][2] = {
> {-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
> { 0.14130051758487, -0.95090983575689}, {-0.47005496701697, -0.37340549728647},
> @@ -610,7 +610,11 @@ const DECLARE_ALIGNED(16, float, ff_sbr_noise_table)[][2] = {
> {-0.93412041758744, 0.41374052024363}, { 0.96063943315511, 0.93116709541280},
> { 0.97534253457837, 0.86150930812689}, { 0.99642466504163, 0.70190043427512},
> {-0.94705089665984, -0.29580042814306}, { 0.91599807087376, -0.98147830385781},
> +// Start of duplicated table
> {-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
> +{ 0.14130051758487, -0.95090983575689}, {-0.47005496701697, -0.37340549728647},
> +{ 0.80705063769351, 0.29653668284408}, {-0.38981478896926, 0.89572605717087},
> +{-0.01053049862020, -0.66959058036166}, {-0.91266367957293, -0.11522938140034},
> };
>
> #endif /* AVCODEC_AACSBRDATA_H */
> diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> index 099382a..e0c2088 100644
> --- a/libavcodec/x86/sbrdsp.asm
> +++ b/libavcodec/x86/sbrdsp.asm
> @@ -26,6 +26,12 @@ SECTION_RODATA
> ps_mask times 2 dd 1<<31, 0
> ps_mask2 times 2 dd 0, 1<<31
> ps_neg times 4 dd 1<<31
> +ps_noise0 times 2 dd 1.0, 0.0,
> +ps_noise2 times 2 dd -1.0, 0.0
> +ps_noise13 dd 0.0, 1.0, 0.0, -1.0
> + dd 0.0, -1.0, 0.0, 1.0
> + dd 0.0, 1.0, 0.0, -1.0
> +cextern sbr_noise_table
>
> SECTION_TEXT
>
> @@ -334,3 +340,105 @@ cglobal sbr_qmf_deint_neg, 2,3,3,v,src,vrev
> cmp vq, vrevq
> jl .loop
> REP_RET
> +
> +%if WIN64
> +%define NREGS 0
> +%elifdef PIC
> +%define NREGS 1
> +%else
> +%define NREGS 0
> +%endif
> +
> +%macro LOAD_NST 1
> +%if NREGS
> + lea r5q, [%1]
> + mova m0, [kxq + r5q]
> +%else
> + mova m0, [kxq + %1]
> +%endif
> +%endmacro
> +
> +INIT_XMM sse2
> +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
> +; const float *q_filt, int noise,
> +; int kx, int m_max)
> +cglobal sbr_hf_apply_noise_0, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> + mova m0, [ps_noise0]
> + jmp apply_noise_main
> +
> +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
> +; const float *q_filt, int noise,
> +; int kx, int m_max)
> +cglobal sbr_hf_apply_noise_1, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> + and kxq, 1
> + shl kxq, 4
> + LOAD_NST ps_noise13
> + jmp apply_noise_main
> +
> +; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
> +; const float *q_filt, int noise,
> +; int kx, int m_max)
> +cglobal sbr_hf_apply_noise_2, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> + mova m0, [ps_noise2]
> + jmp apply_noise_main
> +
> +; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
> +; const float *q_filt, int noise,
> +; int kx, int m_max)
> +cglobal sbr_hf_apply_noise_3, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> + and kxq, 1
> + shl kxq, 4
> + LOAD_NST ps_noise13+16
> +
> +apply_noise_main:
> +%if ARCH_X86_64 == 0 || WIN64
> + mov kxd, m_maxm
> +%define count kxq
> +%else
> +%define count m_maxq
> +%endif
> + dec noiseq
> + shl count, 2
> +%if NREGS
> + lea r5q, [sbr_noise_table]
count and r5q end up being the same register here on x86_64 linux shared builds
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
While the State exists there can be no freedom; when there is freedom there
will be no State. -- Vladimir Lenin
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130413/089247ce/attachment.asc>
More information about the ffmpeg-devel
mailing list