[FFmpeg-devel] [PATCH] yasm ff_imdct_half_sse

Thu Aug 19 04:46:20 CEST 2010

On Wed, 18 Aug 2010, Alex Converse wrote:

> The attached patch ports the ff_imdct_half_sse function to yasm. This wasn't
> done out of any desire for yasm purity. The split asm sections were
> problematic on platforms where xmm registers are callee saved (WIN64). It
> seems worthwhile to move toward supporting our mdct audio decoders on
> WIN64. It also gives fine-grained control over exactly what spills when
> and reduces fft calling overhead.

>@@ -481,3 +502,175 @@
> DECL_FFT 4, _3dn2
> DECL_FFT 4, _3dn2, _interleave
> 
>+INIT_XMM
>+%define mulps mulps
>+%define addps addps
>+%define subps subps
>+%define unpcklps unpcklps
>+%define unpckhps unpckhps

Use %undef for these instead of %define-ing each mnemonic to itself.

>+%macro PREROTATER 5
>+            ;::"r"(-2*k), "r"(2*k),
>+            ;  "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)

Inline asm constraints don't make sense as comments after you remove the inline asm.

>+%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
>+    movaps   xmm6, [%4+%1*2]
>+    movaps   %2,   [%4+%1*2+0x10]
>+    movaps   %3,   xmm6
>+    movaps   xmm7, %2
>+    mulps    xmm6, [%5+%1*1]
>+    mulps    %2,   [%6+%1*1]
>+    mulps    %3,   [%6+%1*1]
>+    mulps    xmm7, [%5+%1*1]
>+    subps    %2,   xmm6
>+    addps    %3,   xmm7
>+%endmacro

Could be more cleanly ordered.

>+%macro POSROTATESHUF 5
>+        ;:"+&r"(j), "+&r"(k)
>+        ;:"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
>+post:

Labels that aren't functions should begin with "."

>+%ifdef ARCH_X86_64
>+cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
>+%define rrevtab r10
>+%define rtcos   r11
>+%define rtsin   r12
>+    push  r10
>+    push  r11
>+    push  r12
>+    push  r13
>+    push  r14
>+%else
>+cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input

Duplicate line: this cglobal declaration is identical to the one in the ARCH_X86_64 branch.

>+    PREROTATER r4, r3, r2, rtcos, rtsin
>+%ifndef ARCH_X86_64

>+    mov    r6, [esp]
>+    movzx  r5, word [r6+r4*1-4]
>+    movzx  r4, word [r6+r4*1-2]
>+    PREROTATEW [r1+r5*8], [r1+r4*8], xmm0
>+    movzx  r5, word [r6+r3*1]
>+    movzx  r4, word [r6+r3*1+2]
>+    PREROTATEW [r1+r5*8], [r1+r4*8], xmm1
>+%else
>+    movzx  r5,  word [rrevtab+r4*1-4]
>+    movzx  r6,  word [rrevtab+r4*1-2]
>+    movzx  r13, word [rrevtab+r3*1]
>+    movzx  r14, word [rrevtab+r3*1+2]
>+    PREROTATEW [r1+r5 *8], [r1+r6 *8], xmm0
>+    PREROTATEW [r1+r13*8], [r1+r14*8], xmm1
>+    add    r4, 4
>+%endif

I prefer positive cases to come first in an if/else, and for the order to be consistent.

--Loren Merritt