[FFmpeg-devel] [PATCH] MMX implementation of VC-1 inverse transformations
Victor Pollex
victor.pollex
Sat Jun 28 12:31:41 CEST 2008
Michael Niedermayer wrote:
> On Sun, Jun 22, 2008 at 03:21:53PM +0200, Victor Pollex wrote:
>
>> Michael Niedermayer wrote:
>>
>>> On Sat, Jun 21, 2008 at 03:37:44PM +0200, Victor Pollex wrote:
>>>
>>>
>>>> Hi,
>>>> as in subject.
>>>>
>>>> Victor Pollex
>>>>
>>>>
>>>
>>>
>>>> Index: libavcodec/i386/vc1dsp_mmx.c
>>>> ===================================================================
>>>> --- libavcodec/i386/vc1dsp_mmx.c (revision 13854)
>>>> +++ libavcodec/i386/vc1dsp_mmx.c (working copy)
>>>> @@ -1,6 +1,7 @@
>>>> /*
>>>> * VC-1 and WMV3 - DSP functions MMX-optimized
>>>> * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>>>> + * Copyright (c) 2008 Victor Pollex
>>>> *
>>>> * Permission is hereby granted, free of charge, to any person
>>>> * obtaining a copy of this software and associated documentation
>>>> @@ -467,7 +468,609 @@
>>>> DECLARE_FUNCTION(3, 2)
>>>> DECLARE_FUNCTION(3, 3)
>>>> +#define LOAD_4X4(stride,base,in)\
>>>> + "movq 0*"#stride"+"#base#in", %%mm0\n\t"\
>>>> + "movq 1*"#stride"+"#base#in", %%mm1\n\t"\
>>>> + "movq 2*"#stride"+"#base#in", %%mm2\n\t"\
>>>> + "movq 3*"#stride"+"#base#in", %%mm3\n\t"
>>>>
>>>>
>>> duplicate of LOAD4
>>>
>>>
>> The only LOAD4 I found is in dsputilenc_mmx.c and has a fixed stride of 8,
>> but I also need a stride of 16. If I missed something, give me a hint.
>>
>
> LOAD4 loads 4 values and your LOAD_4X4 loads 4 values; the two differ only
> in which things are hardcoded and which are settable through arguments.
> That doesn't make them any less duplicates of each other. Simply add a
> stride argument to the existing LOAD4 (in a separate patch!) and use it.
>
>
Attached is a patch which moves the macros from dsputilenc_mmx.c to
dsputil_mmx.h.
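
For reference, the shared macro with a stride argument could look roughly
like this (a sketch modeled on the LOAD_4X4 above; the exact name and
argument order in the attached patch may differ):

    /* Load four quadwords, "stride" bytes apart, starting at offset
       "base" into memory operand "in" (argument order assumed here). */
    #define LOAD4(stride, base, in, r0, r1, r2, r3)\
        "movq 0*"#stride"+"#base#in", "#r0"\n\t"\
        "movq 1*"#stride"+"#base#in", "#r1"\n\t"\
        "movq 2*"#stride"+"#base#in", "#r2"\n\t"\
        "movq 3*"#stride"+"#base#in", "#r3"\n\t"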
> [...]
>
>>>> +/*
>>>> + precondition:
>>>> + r0 = row0/col0
>>>> + r1 = row1/col1
>>>> + r2 = row2/col2
>>>> + r3 = row3/col3
>>>> +
>>>> + postcondition:
>>>> + r0 = col0/row0
>>>> + r1 = col1/row1
>>>> + r2 = col2/row2
>>>> + r3 = col3/row3
>>>> + t0 = undefined
>>>> +*/
>>>> +#define TRANSPOSE_4X4(r0,r1,r2,r3,t0)\
>>>> + "movq "#r2", "#t0"\n\t"\
>>>> + "punpcklwd "#r3", "#r2"\n\t"\
>>>> + "punpckhwd "#r3", "#t0"\n\t"\
>>>> + \
>>>> + "movq "#r0", "#r3"\n\t"\
>>>> + "punpcklwd "#r1", "#r0"\n\t"\
>>>> + "punpckhwd "#r1", "#r3"\n\t"\
>>>> + \
>>>> + "movq "#r0", "#r1"\n\t"\
>>>> + "punpckldq "#r2", "#r0"\n\t"\
>>>> + "punpckhdq "#r2", "#r1"\n\t"\
>>>> + \
>>>> + "movq "#r3", "#r2"\n\t"\
>>>> + "punpckldq "#t0", "#r2"\n\t"\
>>>> + "punpckhdq "#t0", "#r3"\n\t"
>>>>
>>>>
>>> duplicate of TRANSPOSE4
>>>
>>>
>> Basically yes, but here the output registers are the same as the input
>> registers and in the same order, whereas with TRANSPOSE4 the input
>> registers are a, b, c, d and the output registers are a, d, t, c,
>> according to the comments. I could try to rewrite part of the code to
>> use TRANSPOSE4, but I would rather rewrite it so that no transpose is
>> needed in the first place.
>>
>
> It's entirely your decision how to solve it, as long as the solution is
> optimal speed/size/readability-wise. But I will not approve a patch that
> adds duplicated code (like TRANSPOSE4 / TRANSPOSE_4X4) when it is
> avoidable.
>
>
Attached is a patch which now uses TRANSPOSE4, LOAD4 and STORE4. I tried to
write a 4x4 row transformation without the need to transpose, but it ended
up being slower.
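
For clarity: the punpcklwd/punpckhwd/punpckldq/punpckhdq sequence computes
an ordinary 4x4 transpose of 16-bit words. In plain C it is equivalent to
the following (illustration only, not part of the patch):

    /* Plain-C equivalent of the MMX 4x4 word transpose. */
    static void transpose_4x4_c(int16_t m[4][4])
    {
        int i, j;
        for (i = 0; i < 4; i++)
            for (j = 0; j < i; j++) {
                int16_t t = m[i][j];
                m[i][j] = m[j][i];
                m[j][i] = t;
            }
    }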
> [...]
>
>>>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>>>> +{
>>>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> + asm volatile(
>>>> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>>> + TRANSFORM_8X4_ROW(0x40, (%0), %1)
>>>> +
>>>> +
>>>> + LOAD_4X4(0x10, 0x00, %1)
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x00, %1)
>>>> + LOAD_4X4(0x10, 0x40, %1)
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x40, %1)
>>>> + TRANSFORM_4X8_COL(0x00, %1, (%0))
>>>> +
>>>> + LOAD_4X4(0x10, 0x08, %1)
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x08, %1)
>>>> + LOAD_4X4(0x10, 0x48, %1)
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x48, %1)
>>>> + TRANSFORM_4X8_COL(0x08, %1, (%0))
>>>> + : "+r"(block), "+m"(temp)
>>>> + :
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>> +{
>>>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> + asm volatile(
>>>> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>>> +
>>>> + LOAD_4X4(0x10, 0x00, %1)
>>>> + TRANSFORM_4X4_COL
>>>> + STORE_4X4(0x10, 0x00, (%0))
>>>> + LOAD_4X4(0x10, 0x08, %1)
>>>> + TRANSFORM_4X4_COL
>>>> + STORE_4X4(0x10, 0x08, (%0))
>>>> +
>>>> + "pxor %%mm7, %%mm7\n\t"
>>>> + LOAD_4X4(0x08, 0x00, (%0))
>>>> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>>> + "add %3, %2\n\t"
>>>> + LOAD_4X4(0x08, 0x20, (%0))
>>>> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>>> + : "+r"(block), "+m"(temp), "+r"(dest)
>>>> + : "r"(linesize)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>> +{
>>>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> + asm volatile(
>>>> + LOAD_4X4(0x10, 0x00, (%0))
>>>> + TRANSFORM_4X4_ROW
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x00, %1)
>>>> + LOAD_4X4(0x10, 0x40, (%0))
>>>> + TRANSFORM_4X4_ROW
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x40, %1)
>>>> +
>>>> + TRANSFORM_4X8_COL(0x00, %1, (%0))
>>>> +
>>>> + "pxor %%mm7, %%mm7\n\t"
>>>> + LOAD_4X4(0x10, 0x00, (%0))
>>>> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>>> + "add %3, %2\n\t"
>>>> + LOAD_4X4(0x10, 0x40, (%0))
>>>> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>>> + : "+r"(block), "+m"(temp), "+r"(dest)
>>>> + : "r"(linesize)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +static void vc1_inv_trans_4x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>> +{
>>>> + asm volatile(
>>>> + LOAD_4X4(0x10, 0x00, (%1))
>>>> + TRANSFORM_4X4_ROW
>>>> + TRANSFORM_4X4_COL
>>>> + "pxor %%mm7, %%mm7\n\t"
>>>> + LOAD_ADD_CLAMP_STORE_4X4(%0, %2)
>>>> + : "+r"(dest)
>>>> + : "r"(block), "r"(linesize)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>>
>>>>
>>> I do not think that brute-force duplicating and unrolling of all
>>> variants is optimal. Also, benchmarks are needed: C vs. your MMX vs.
>>> MMX code with no duplicated transforms.
>>>
>>> [...]
>>>
>>>
>> What do you mean by duplicated transforms? Do you mean that, for example,
>> the 4x4 row transformation should be a function instead of a macro?
>>
>
> Yes (non-inlined functions, or just plain asm that uses loops/calls,
> whatever); I meant this should be tried. Code cache is a limited resource,
> and your code after macro expansion is likely large. I do not know which
> way would be faster, of course, but it should be tested.
>
>
I tried it with non-inlined functions for the 8x8 transformation, but for
me it was slower.
>
>> As for benchmarks, I used START_TIMER and STOP_TIMER from libavutil with
>> mingw 4.3.0-tdm3. I don't know if it is an appropriate method, but I did
>> it somewhat like this:
>>
>> for(i = 0; i < (1 << 19); ++i) {
>>     memcpy(block1, block, 64 * sizeof(short));
>>     START_TIMER
>>     vc1_inv_trans_4x4_c(dest, 8, block1);
>>     STOP_TIMER("vc1_inv_trans_4x4_c")
>> }
>>
>
> The transforms should be tested in the real code in vc1.c; just put
> START/STOP_TIMER around the call to the transform.
>
>
> [...]
>
>
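That is what the attached vc1_timer.patch does, roughly like this around
each call in vc1.c (a sketch; the actual call sites and field names in
vc1.c differ slightly):

    {
        START_TIMER
        s->dsp.vc1_inv_trans_8x8(block[i]);
        STOP_TIMER("vc1_inv_trans_8x8")
    }
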
These benchmarks were done with mingw 3.4.5.
For the C version:
19801 dezicycles in vc1_inv_trans_8x8, 1048466 runs, 110 skips
4069 dezicycles in vc1_inv_trans_4x4, 262046 runs, 98 skips
9432 dezicycles in vc1_inv_trans_4x8, 524263 runs, 25 skips
8056 dezicycles in vc1_inv_trans_8x4, 524259 runs, 29 skips
3992 dezicycles in vc1_inv_trans_4x4, 524148 runs, 140 skips
19743 dezicycles in vc1_inv_trans_8x8, 2096925 runs, 227 skips
9393 dezicycles in vc1_inv_trans_4x8, 1048529 runs, 47 skips
8006 dezicycles in vc1_inv_trans_8x4, 1048520 runs, 56 skips
3929 dezicycles in vc1_inv_trans_4x4, 1048312 runs, 264 skips
3893 dezicycles in vc1_inv_trans_4x4, 2096685 runs, 467 skips
7950 dezicycles in vc1_inv_trans_8x4, 2097040 runs, 112 skips
9358 dezicycles in vc1_inv_trans_4x8, 2097028 runs, 124 skips
19529 dezicycles in vc1_inv_trans_8x8, 4193864 runs, 440 skips
For the MMX version:
7157 dezicycles in vc1_inv_trans_8x8, 1048200 runs, 376 skips
2567 dezicycles in vc1_inv_trans_4x4, 261770 runs, 374 skips
4454 dezicycles in vc1_inv_trans_4x8, 523938 runs, 350 skips
3949 dezicycles in vc1_inv_trans_8x4, 523877 runs, 411 skips
2532 dezicycles in vc1_inv_trans_4x4, 523793 runs, 495 skips
7159 dezicycles in vc1_inv_trans_8x8, 2096437 runs, 715 skips
4424 dezicycles in vc1_inv_trans_4x8, 1047981 runs, 595 skips
3914 dezicycles in vc1_inv_trans_8x4, 1047851 runs, 725 skips
2512 dezicycles in vc1_inv_trans_4x4, 1047902 runs, 674 skips
2501 dezicycles in vc1_inv_trans_4x4, 2096179 runs, 973 skips
3896 dezicycles in vc1_inv_trans_8x4, 2095994 runs, 1158 skips
4412 dezicycles in vc1_inv_trans_4x8, 2096082 runs, 1070 skips
7142 dezicycles in vc1_inv_trans_8x8, 4193046 runs, 1258 skips
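Comparing the largest runs, the MMX versions come out roughly 19529/7142 ≈
2.7x faster for 8x8, 9358/4412 ≈ 2.1x for 4x8, 7950/3896 ≈ 2.0x for 8x4,
and 3893/2501 ≈ 1.6x for 4x4.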
If you are wondering why the C version of the 8x8 transformation is no
longer as fast as the MMX version, as my last post suggested: I mistakenly
benchmarked the MMX version twice last time. I have attached a patch which
adds START/STOP_TIMER around the calls so that someone else can check.
[...]
>
>> +#define SRA2(r0,r1,c0)\
>> + "psraw $"#c0", "#r0"\n\t" /* r0 >> c0 */\
>> + "psraw $"#c0", "#r1"\n\t" /* r1 >> c0 */
>> +
>> +/*
>> + postcondition:
>> + r0 = r0 >> c0;
>> + r1 = r1 >> c0;
>> + r2 = r2 >> c0;
>> + r3 = r3 >> c0;
>> +*/
>> +#define SRA4(r0,r1,r2,r3,c0)\
>> + SRA2(r0,r1,c0)\
>> + SRA2(r2,r3,c0)
>>
>
> I think a generic macro like:
>
> #define OPC_SS_AB(opc, dst0, dst1, src) \
>     opc" "src", "dst0"\n\t" \
>     opc" "src", "dst1"\n\t"
>
> #define OPC_SSSS_ABCD(opc, dst0, dst1, dst2, dst3, src) \
>     OPC_SS_AB(opc, dst0, dst1, src) \
>     OPC_SS_AB(opc, dst2, dst3, src)
>
> would be simpler
>
>
> [...]
>
>> +DECLARE_ALIGNED_16(static const int16_t, constants[]) = {
>> + X4( 4),
>> + X4(64)
>> +};
>>
>
> Also, please reduce the number of macros in your code; this hurts
> readability a lot. That means removing at least every macro that is
> bigger than the space it saves. An example:
>
>     4, 4, 4, 4,
>     64,64,64,64
>
> vs.
>
>     #define X4(x) x, x, x, x
>     X4( 4),
>     X4(64)
>
> [...]
>
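As a usage sketch for the generic macro (assuming the OPC_SSSS_ABCD form
above), SRA4(%%mm0,%%mm1,%%mm2,%%mm3,3) would become:

    /* shift four registers right by the same immediate (illustration) */
    OPC_SSSS_ABCD("psraw", "%%mm0", "%%mm1", "%%mm2", "%%mm3", "$3")
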
Hopefully all issues have been addressed.
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: dsputil.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080628/4b5806e8/attachment.asc>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: vc1_inv_trans_mmx_v3.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080628/4b5806e8/attachment.txt>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: vc1_timer.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080628/4b5806e8/attachment-0001.asc>