[FFmpeg-devel] [PATCH] yasm ff_imdct_half_sse
Loren Merritt
lorenm
Thu Aug 19 04:46:20 CEST 2010
On Wed, 18 Aug 2010, Alex Converse wrote:
> The attached patch ports the ff_imdct_half_sse function to yasm. This wasn't
> done out of any desire for yasm purity. The split asm sections were
> problematic on platforms where xmm registers are callee saved (WIN64). It
> seems worthwhile to move toward supporting out mdct audio decoders on
> WIN64. It also gives fine grain control over exactly what spills when and
> and reduces fft calling overhead.
>@@ -481,3 +502,175 @@
> DECL_FFT 4, _3dn2
> DECL_FFT 4, _3dn2, _interleave
>
>+INIT_XMM
>+%define mulps mulps
>+%define addps addps
>+%define subps subps
>+%define unpcklps unpcklps
>+%define unpckhps unpckhps
undef
>+%macro PREROTATER 5
>+ ;::"r"(-2*k), "r"(2*k),
>+ ; "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
Inline asm constraints don't make sense as comments after you remove the inline asm.
>+%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
>+ movaps xmm6, [%4+%1*2]
>+ movaps %2, [%4+%1*2+0x10]
>+ movaps %3, xmm6
>+ movaps xmm7, %2
>+ mulps xmm6, [%5+%1*1]
>+ mulps %2, [%6+%1*1]
>+ mulps %3, [%6+%1*1]
>+ mulps xmm7, [%5+%1*1]
>+ subps %2, xmm6
>+ addps %3, xmm7
>+%endmacro
Could be more cleanly ordered.
>+%macro POSROTATESHUF 5
>+ ;:"+&r"(j), "+&r"(k)
>+ ;:"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
>+post:
Labels that aren't functions should begin with "."
>+%ifdef ARCH_X86_64
>+cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
>+%define rrevtab r10
>+%define rtcos r11
>+%define rtsin r12
>+ push r10
>+ push r11
>+ push r12
>+ push r13
>+ push r14
>+%else
>+cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
duplicate line
>+ PREROTATER r4, r3, r2, rtcos, rtsin
>+%ifndef ARCH_X86_64
>+ mov r6, [esp]
>+ movzx r5, word [r6+r4*1-4]
>+ movzx r4, word [r6+r4*1-2]
>+ PREROTATEW [r1+r5*8], [r1+r4*8], xmm0
>+ movzx r5, word [r6+r3*1]
>+ movzx r4, word [r6+r3*1+2]
>+ PREROTATEW [r1+r5*8], [r1+r4*8], xmm1
>+%else
>+ movzx r5, word [rrevtab+r4*1-4]
>+ movzx r6, word [rrevtab+r4*1-2]
>+ movzx r13, word [rrevtab+r3*1]
>+ movzx r14, word [rrevtab+r3*1+2]
>+ PREROTATEW [r1+r5 *8], [r1+r6 *8], xmm0
>+ PREROTATEW [r1+r13*8], [r1+r14*8], xmm1
>+ add r4, 4
>+%endif
I prefer positive cases to come first in an if/else, and for the order to be consistent.
--Loren Merritt
More information about the ffmpeg-devel
mailing list