[FFmpeg-devel] [RFC][PATCH] DSPUtilize some functions from APE decoder

Thu Jul 3 01:45:40 CEST 2008

On Wed, 2 Jul 2008, Kostya wrote:

> I'm not satisfied with the decoding speed of APE decoder,
> so I've decided to finally dsputilize functions marked as such.

> +static void vector_int16_add_sse(int16_t * v1, int16_t * v2, int order)

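This uses SSE2 instructions (integer operations on xmm registers), so the function should be named with an _sse2 suffix, not _sse.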

> +       "movdqa  (%0),   %%xmm0 \n\t"
> +       "movdqu  (%1),   %%xmm1 \n\t"
> +       "paddw   %%xmm1, %%xmm0 \n\t"

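The separate aligned load is unnecessary; paddw can take the aligned operand directly from memory, which saves one instruction and one register:

movdqu  (%1),   %%xmm0
paddw   (%0),   %%xmm0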

> +static int32_t vector_int16_scalarproduct_sse(int16_t * v1, int16_t * v2, int order)
> +{
> +    int i;
> +    int res = 0, *resp=&res;
> +
> +    asm volatile("pxor %xmm7, %xmm7 \n\t");
> +
> +    for(i = 0; i < order; i += 8){
> +        asm volatile(
> +       "movdqu   (%0),   %%xmm0 \n\t"
> +       "movdqa   (%1),   %%xmm1 \n\t"
> +       "pmaddwd  %%xmm1, %%xmm0 \n\t"
> +       "movhlps  %%xmm0, %%xmm2 \n\t"
> +
> +       "paddd    %%xmm2, %%xmm0 \n\t"
> +       "pshufd  $0x01, %%xmm0,%%xmm2 \n\t"
> +       "paddd    %%xmm2, %%xmm0 \n\t"
> +       "paddd   %%xmm0, %%xmm7 \n\t"
> +       : "+r"(v1), "+r"(v2)
> +       );
> +       v1 += 8;
> +       v2 += 8;
> +    }
> +    asm volatile("movd %%xmm7, (%0)\n\t" : "+r"(resp));
> +    return res;
> +}

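The horizontal sum should be outside the loop: inside the loop, just accumulate the pmaddwd results into xmm7 with paddd, and reduce the four 32-bit partial sums to a single value once, after the loop.
Also, pshuflw is faster than pshufd, so use it for the final shuffle.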

--Loren Merritt