[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations
Victor Pollex
victor.pollex
Sun Jun 22 15:21:53 CEST 2008
Michael Niedermayer wrote:
> On Sat, Jun 21, 2008 at 03:37:44PM +0200, Victor Pollex wrote:
>
>> Hi,
>> as in subject.
>>
>> Victor Pollex
>>
>
>
>> Index: libavcodec/i386/vc1dsp_mmx.c
>> ===================================================================
>> --- libavcodec/i386/vc1dsp_mmx.c (revision 13854)
>> +++ libavcodec/i386/vc1dsp_mmx.c (working copy)
>> @@ -1,6 +1,7 @@
>> /*
>> * VC-1 and WMV3 - DSP functions MMX-optimized
>> * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>> + * Copyright (c) 2008 Victor Pollex
>> *
>> * Permission is hereby granted, free of charge, to any person
>> * obtaining a copy of this software and associated documentation
>> @@ -467,7 +468,609 @@
>> DECLARE_FUNCTION(3, 2)
>> DECLARE_FUNCTION(3, 3)
>>
>> +#define LOAD_4X4(stride,base,in)\
>> + "movq 0*"#stride"+"#base#in", %%mm0\n\t"\
>> + "movq 1*"#stride"+"#base#in", %%mm1\n\t"\
>> + "movq 2*"#stride"+"#base#in", %%mm2\n\t"\
>> + "movq 3*"#stride"+"#base#in", %%mm3\n\t"
>>
>
> duplicate of LOAD4
>
the only LOAD4 I found is in dsputilenc_mmx.c, and it has a fixed stride of
8, but I also need a stride of 16. If I missed something, give me a hint.
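If sharing one macro is wanted, it could take the stride (and the registers)
as arguments, along the lines of my LOAD_4X4 above; a sketch only, the name
LOAD4W is made up:

#define LOAD4W(stride, base, in, r0, r1, r2, r3)\
    "movq 0*"#stride"+"#base#in", "#r0"\n\t"\
    "movq 1*"#stride"+"#base#in", "#r1"\n\t"\
    "movq 2*"#stride"+"#base#in", "#r2"\n\t"\
    "movq 3*"#stride"+"#base#in", "#r3"\n\t"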
>
>
>> +
>> +#define STORE_4X4(stride,base,out)\
>> + "movq %%mm0, 0*"#stride"+"#base#out"\n\t"\
>> + "movq %%mm1, 1*"#stride"+"#base#out"\n\t"\
>> + "movq %%mm2, 2*"#stride"+"#base#out"\n\t"\
>> + "movq %%mm3, 3*"#stride"+"#base#out"\n\t"
>> +
>>
>
> duplicate of STORE4
>
Same as with LOAD4, except I only need a stride of 16.
>
>
>> +/*
>> + precondition:
>> + r0 = row0/col0
>> + r1 = row1/col1
>> + r2 = row2/col2
>> + r3 = row3/col3
>> +
>> + postcondition:
>> + r0 = col0/row0
>> + r1 = col1/row1
>> + r2 = col2/row2
>> + r3 = col3/row3
>> + t0 = undefined
>> +*/
>> +#define TRANSPOSE_4X4(r0,r1,r2,r3,t0)\
>> + "movq "#r2", "#t0"\n\t"\
>> + "punpcklwd "#r3", "#r2"\n\t"\
>> + "punpckhwd "#r3", "#t0"\n\t"\
>> + \
>> + "movq "#r0", "#r3"\n\t"\
>> + "punpcklwd "#r1", "#r0"\n\t"\
>> + "punpckhwd "#r1", "#r3"\n\t"\
>> + \
>> + "movq "#r0", "#r1"\n\t"\
>> + "punpckldq "#r2", "#r0"\n\t"\
>> + "punpckhdq "#r2", "#r1"\n\t"\
>> + \
>> + "movq "#r3", "#r2"\n\t"\
>> + "punpckldq "#t0", "#r2"\n\t"\
>> + "punpckhdq "#t0", "#r3"\n\t"
>>
>
> duplicate of TRANSPOSE4
>
Basically yes, but here the output registers are the same, and in the same
order, as the input registers, whereas with TRANSPOSE4 the input registers
are a, b, c, d and the output registers are a, d, t, c according to the
comments. I could try to rewrite part of the code to use TRANSPOSE4, but I
would rather rewrite it so that I don't need to transpose in the first place.
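For reference, the punpck sequence above is just an in-place transpose of a
4x4 block of 16-bit words: the two punpcklwd/punpckhwd pairs interleave the
rows into 2x2 word blocks, and the punpckldq/punpckhdq pass moves each
32-bit pair into its final place. In plain C the same operation is
(illustration only, not part of the patch):

static void transpose4x4_ref(int16_t m[4][4])
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = i + 1; j < 4; j++) {
            int16_t t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}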
>
>
>> +
>> +
>> +/*
>> + precondition:
>> + -(2^15) <= r0 < 2^15
>> + -(2^14) <= r1 < 2^14
>> + -(2^15) <= r1 + r0 < 2^15
>> + postcondition:
>> + r0 = r1 + r0
>> + r1 = r1 - r0
>> +*/
>> +#define TRANSFORM_COMMON_ADDSUB(r0,r1)\
>> + "paddw "#r1", "#r0" \n\t"\
>> + "psllw $1, "#r1" \n\t"\
>> + "psubw "#r0", "#r1" \n\t"
>> +
>>
>
> duplicate of SUMSUB_BA
>
changed it.
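For clarity, here is what the macro (and, as far as I can tell, SUMSUB_BA)
computes, written as scalar C; illustration only, since the MMX version
wraps mod 2^16 per lane, which is why the preconditions above matter:

static inline void sumsub(int16_t *r0, int16_t *r1)
{
    int sum  = *r1 + *r0; /* computed as int, so no overflow here */
    int diff = *r1 - *r0;
    *r0 = sum;            /* truncates to 16 bits like paddw */
    *r1 = diff;           /* truncates to 16 bits like psubw */
}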
>
>
>> +/*
>> + postcondition:
>> + r0 = [15:0](r0 + r2);
>> + r1 = [15:0](r1 - r2);
>> +*/
>> +#define TRANSFORM_COMMON_ADD1SUB1(r0,r1,r2)\
>> + "paddw "#r2", "#r0"\n\t" /* r0 + r2 */\
>> + "psubw "#r2", "#r1"\n\t" /* r1 - r2 */
>>
>
> "TRANSFORM_COMMON" says nothing about any of the macros it just
> makes them longer
>
removed it.
>
> [...]
>
>> +#define TRANSFORM_4X4_COMMON(r0,r1,r2,r3,r4,r5,r6,r7,c0)\
>> + TRANSFORM_COMMON_ADDSUB(r2,r0)\
>> + "movq "#r0", "#r5"\n\t" /* r0 - r2 */\
>> + "movq "#r2", "#r7"\n\t" /* r0 + r2 */\
>>
>
>
>> + "pcmpeqw "#r4", "#r4"\n\t" /* -1 */\
>> + "psllw $"#c0", "#r4"\n\t" /* -1 << c0 */\
>>
>
> c0 is a constant; calculating -1 << c0 at runtime is inefficient
>
>
I made them static consts which are loaded.
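Roughly in the style of the ff_pw_* constants in dsputil_mmx.c; a sketch
(the constant name pw_4 is mine, shown for c0 = 2):

DECLARE_ALIGNED_8(static const uint64_t, pw_4) = 0x0004000400040004ULL;
/* inside the asm block, instead of pcmpeqw + psllw + psubw: */
"paddw "MANGLE(pw_4)", %%mm5\n\t" /* r0 - r2 + (1 << c0) */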
>
>> + "psubw "#r4", "#r5"\n\t" /* r0 - r2 + (1 << c0) */\
>> + "psubw "#r4", "#r7"\n\t" /* r0 + r2 + (1 << c0) */\
>> + TRANSFORM_COMMON_SRA2(r5,r7,1)\
>> + "movq "#r1", "#r4"\n\t" /* r1 */\
>> + "movq "#r3", "#r6"\n\t" /* r3 */\
>> + \
>>
>
>
>> + "psllw $1, "#r1"\n\t" /* 2 * r1 */\
>>
>
> paddw is faster
>
changed it.
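i.e. the doubling becomes an add of the register to itself:

    "paddw "#r1", "#r1"\n\t" /* 2 * r1, same result as psllw $1 but cheaper */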
>
> [...]
>
>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
>> + TRANSFORM_8X4_ROW(0x40, (%0), %1)
>> +
>> +
>> + LOAD_4X4(0x10, 0x00, %1)
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x00, %1)
>> + LOAD_4X4(0x10, 0x40, %1)
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x40, %1)
>> + TRANSFORM_4X8_COL(0x00, %1, (%0))
>> +
>> + LOAD_4X4(0x10, 0x08, %1)
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x08, %1)
>> + LOAD_4X4(0x10, 0x48, %1)
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x48, %1)
>> + TRANSFORM_4X8_COL(0x08, %1, (%0))
>> + : "+r"(block), "+m"(temp)
>> + :
>> + : "memory"
>> + );
>> +}
>> +
>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
>> +
>> + LOAD_4X4(0x10, 0x00, %1)
>> + TRANSFORM_4X4_COL
>> + STORE_4X4(0x10, 0x00, (%0))
>> + LOAD_4X4(0x10, 0x08, %1)
>> + TRANSFORM_4X4_COL
>> + STORE_4X4(0x10, 0x08, (%0))
>> +
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD_4X4(0x08, 0x00, (%0))
>> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>> + "add %3, %2\n\t"
>> + LOAD_4X4(0x08, 0x20, (%0))
>> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>> + : "+r"(block), "+m"(temp), "+r"(dest)
>> + : "r"(linesize)
>> + : "memory"
>> + );
>> +}
>> +
>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + LOAD_4X4(0x10, 0x00, (%0))
>> + TRANSFORM_4X4_ROW
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x00, %1)
>> + LOAD_4X4(0x10, 0x40, (%0))
>> + TRANSFORM_4X4_ROW
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x40, %1)
>> +
>> + TRANSFORM_4X8_COL(0x00, %1, (%0))
>> +
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD_4X4(0x10, 0x00, (%0))
>> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>> + "add %3, %2\n\t"
>> + LOAD_4X4(0x10, 0x40, (%0))
>> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>> + : "+r"(block), "+m"(temp), "+r"(dest)
>> + : "r"(linesize)
>> + : "memory"
>> + );
>> +}
>> +
>> +static void vc1_inv_trans_4x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> + asm volatile(
>> + LOAD_4X4(0x10, 0x00, (%1))
>> + TRANSFORM_4X4_ROW
>> + TRANSFORM_4X4_COL
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD_ADD_CLAMP_STORE_4X4(%0, %2)
>> + : "+r"(dest)
>> + : "r"(block), "r"(linesize)
>> + : "memory"
>> + );
>> +}
>>
>
> I do not think that brute force duplicating and unrolling of all variants
> is optimal. Also benchmarks are needed for C vs. your mmx vs. mmx
> code with no duplicated transforms
>
> [...]
>
What do you mean by duplicated transforms? Do you mean that, for example,
the 4x4 row transformation should be a function instead of a macro?
As for benchmarks, I used START_TIMER and STOP_TIMER from libavutil with
MinGW 4.3.0-tdm3.
I don't know if it is an appropriate method, but I did it somewhat like this:
for (i = 0; i < (1 << 19); ++i) {
    memcpy(block1, block, 64 * sizeof(short));
    START_TIMER
    vc1_inv_trans_4x4_c(dest, 8, block1);
    STOP_TIMER("vc1_inv_trans_4x4_c")
}
4057 dezicycles in vc1_inv_trans_4x4_c, 65519 runs, 17 skips
4048 dezicycles in vc1_inv_trans_4x4_c, 131038 runs, 34 skips
4050 dezicycles in vc1_inv_trans_4x4_c, 262080 runs, 64 skips
4048 dezicycles in vc1_inv_trans_4x4_c, 524167 runs, 121 skips
1917 dezicycles in vc1_inv_trans_4x4_mmx, 65529 runs, 7 skips
1917 dezicycles in vc1_inv_trans_4x4_mmx, 131049 runs, 23 skips
1915 dezicycles in vc1_inv_trans_4x4_mmx, 262104 runs, 40 skips
1915 dezicycles in vc1_inv_trans_4x4_mmx, 524199 runs, 89 skips
7676 dezicycles in vc1_inv_trans_4x8_c, 65503 runs, 33 skips
7654 dezicycles in vc1_inv_trans_4x8_c, 131011 runs, 61 skips
7586 dezicycles in vc1_inv_trans_4x8_c, 262017 runs, 127 skips
7570 dezicycles in vc1_inv_trans_4x8_c, 524031 runs, 257 skips
3882 dezicycles in vc1_inv_trans_4x8_mmx, 65523 runs, 13 skips
3881 dezicycles in vc1_inv_trans_4x8_mmx, 131044 runs, 28 skips
3881 dezicycles in vc1_inv_trans_4x8_mmx, 262091 runs, 53 skips
3880 dezicycles in vc1_inv_trans_4x8_mmx, 524172 runs, 116 skips
9561 dezicycles in vc1_inv_trans_8x4_c, 65500 runs, 36 skips
9545 dezicycles in vc1_inv_trans_8x4_c, 131004 runs, 68 skips
9554 dezicycles in vc1_inv_trans_8x4_c, 261992 runs, 152 skips
9540 dezicycles in vc1_inv_trans_8x4_c, 523987 runs, 301 skips
3789 dezicycles in vc1_inv_trans_8x4_mmx, 65517 runs, 19 skips
3787 dezicycles in vc1_inv_trans_8x4_mmx, 131035 runs, 37 skips
3786 dezicycles in vc1_inv_trans_8x4_mmx, 262077 runs, 67 skips
3786 dezicycles in vc1_inv_trans_8x4_mmx, 524161 runs, 127 skips
7377 dezicycles in vc1_inv_trans_8x8_c, 65506 runs, 30 skips
7377 dezicycles in vc1_inv_trans_8x8_c, 131009 runs, 63 skips
7377 dezicycles in vc1_inv_trans_8x8_c, 262014 runs, 130 skips
7377 dezicycles in vc1_inv_trans_8x8_c, 524041 runs, 247 skips
7360 dezicycles in vc1_inv_trans_8x8_mmx, 65503 runs, 33 skips
7376 dezicycles in vc1_inv_trans_8x8_mmx, 131006 runs, 66 skips
7382 dezicycles in vc1_inv_trans_8x8_mmx, 262016 runs, 128 skips
7387 dezicycles in vc1_inv_trans_8x8_mmx, 524047 runs, 241 skips
-------------- next part --------------
Attachment: vc1_inv_trans_mmx_v2.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080622/663a7fa0/attachment.txt>