[FFmpeg-devel] [PATCH] MMX implementation of VC-1 inverse transformations
Victor Pollex
victor.pollex
Sat Jun 28 12:31:41 CEST 2008
Michael Niedermayer wrote:
> On Sun, Jun 22, 2008 at 03:21:53PM +0200, Victor Pollex wrote:
>
>> Michael Niedermayer wrote:
>>
>>> On Sat, Jun 21, 2008 at 03:37:44PM +0200, Victor Pollex wrote:
>>>
>>>
>>>> Hi,
>>>> as in subject.
>>>>
>>>> Victor Pollex
>>>>
>>>>
>>>
>>>
>>>> Index: libavcodec/i386/vc1dsp_mmx.c
>>>> ===================================================================
>>>> --- libavcodec/i386/vc1dsp_mmx.c (revision 13854)
>>>> +++ libavcodec/i386/vc1dsp_mmx.c (working copy)
>>>> @@ -1,6 +1,7 @@
>>>> /*
>>>> * VC-1 and WMV3 - DSP functions MMX-optimized
>>>> * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>>>> + * Copyright (c) 2008 Victor Pollex
>>>> *
>>>> * Permission is hereby granted, free of charge, to any person
>>>> * obtaining a copy of this software and associated documentation
>>>> @@ -467,7 +468,609 @@
>>>> DECLARE_FUNCTION(3, 2)
>>>> DECLARE_FUNCTION(3, 3)
>>>> +#define LOAD_4X4(stride,base,in)\
>>>> + "movq 0*"#stride"+"#base#in", %%mm0\n\t"\
>>>> + "movq 1*"#stride"+"#base#in", %%mm1\n\t"\
>>>> + "movq 2*"#stride"+"#base#in", %%mm2\n\t"\
>>>> + "movq 3*"#stride"+"#base#in", %%mm3\n\t"
>>>>
>>>>
>>> duplicate of LOAD4
>>>
>>>
>> The only LOAD4 I found is in dsputilenc_mmx.c and has a fixed stride of 8,
>> but I also need a stride of 16. If I missed something, give me a hint.
>>
>
> LOAD4 loads 4 values and your LOAD_4X4 loads 4 values; the two differ only
> in which things are hardcoded and which are settable through arguments.
> That doesn't make them any less duplicates of each other. Simply add a
> stride argument to the existing LOAD4 (in a separate patch!) and use it.
>
>
Attached is a patch which moves the macros from dsputilenc_mmx.c to
dsputil_mmx.h.
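
For reference, the shared macro with a stride argument could look roughly
like this (a sketch modeled on the LOAD_4X4 above; the exact name and
argument order in the attached patch may differ):

    /* Load four quadwords, "stride" bytes apart, starting at offset
       "base" into memory operand "in" (argument order assumed here). */
    #define LOAD4(stride, base, in, r0, r1, r2, r3)\
        "movq 0*"#stride"+"#base#in", "#r0"\n\t"\
        "movq 1*"#stride"+"#base#in", "#r1"\n\t"\
        "movq 2*"#stride"+"#base#in", "#r2"\n\t"\
        "movq 3*"#stride"+"#base#in", "#r3"\n\t"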
> [...]
>
>>>> +/*
>>>> + precondition:
>>>> + r0 = row0/col0
>>>> + r1 = row1/col1
>>>> + r2 = row2/col2
>>>> + r3 = row3/col3
>>>> +
>>>> + postcondition:
>>>> + r0 = col0/row0
>>>> + r1 = col1/row1
>>>> + r2 = col2/row2
>>>> + r3 = col3/row3
>>>> + t0 = undefined
>>>> +*/
>>>> +#define TRANSPOSE_4X4(r0,r1,r2,r3,t0)\
>>>> + "movq "#r2", "#t0"\n\t"\
>>>> + "punpcklwd "#r3", "#r2"\n\t"\
>>>> + "punpckhwd "#r3", "#t0"\n\t"\
>>>> + \
>>>> + "movq "#r0", "#r3"\n\t"\
>>>> + "punpcklwd "#r1", "#r0"\n\t"\
>>>> + "punpckhwd "#r1", "#r3"\n\t"\
>>>> + \
>>>> + "movq "#r0", "#r1"\n\t"\
>>>> + "punpckldq "#r2", "#r0"\n\t"\
>>>> + "punpckhdq "#r2", "#r1"\n\t"\
>>>> + \
>>>> + "movq "#r3", "#r2"\n\t"\
>>>> + "punpckldq "#t0", "#r2"\n\t"\
>>>> + "punpckhdq "#t0", "#r3"\n\t"
>>>>
>>>>
>>> duplicate of TRANSPOSE4
>>>
>>>
>> Basically yes, but here the output registers are the same as the input
>> registers and in the same order, whereas with TRANSPOSE4 the input
>> registers are a, b, c, d and the output registers are a, d, t, c,
>> according to the comments. I could try to rewrite part of the code to
>> use TRANSPOSE4, but I would rather rewrite it so that no transpose is
>> needed in the first place.
>>
>
> It's entirely your decision how to solve it, as long as the solution is
> optimal speed/size/readability-wise. But I will not approve a patch that
> adds duplicated code (like TRANSPOSE4 / TRANSPOSE_4X4) when it is
> avoidable.
>
>
Attached is a patch which now uses TRANSPOSE4, LOAD4 and STORE4. I tried to
write a 4x4 row transformation without the need to transpose, but it ended
up being slower.
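
For clarity: the punpcklwd/punpckhwd/punpckldq/punpckhdq sequence computes
an ordinary 4x4 transpose of 16-bit words. In plain C it is equivalent to
the following (illustration only, not part of the patch):

    /* Plain-C equivalent of the MMX 4x4 word transpose. */
    static void transpose_4x4_c(int16_t m[4][4])
    {
        int i, j;
        for (i = 0; i < 4; i++)
            for (j = 0; j < i; j++) {
                int16_t t = m[i][j];
                m[i][j] = m[j][i];
                m[j][i] = t;
            }
    }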
> [...]
>
>>>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>>>> +{
>>>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> + asm volatile(
>>>> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>>> + TRANSFORM_8X4_ROW(0x40, (%0), %1)
>>>> +
>>>> +
>>>> + LOAD_4X4(0x10, 0x00, %1)
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x00, %1)
>>>> + LOAD_4X4(0x10, 0x40, %1)
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x40, %1)
>>>> + TRANSFORM_4X8_COL(0x00, %1, (%0))
>>>> +
>>>> + LOAD_4X4(0x10, 0x08, %1)
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x08, %1)
>>>> + LOAD_4X4(0x10, 0x48, %1)
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x48, %1)
>>>> + TRANSFORM_4X8_COL(0x08, %1, (%0))
>>>> + : "+r"(block), "+m"(temp)
>>>> + :
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>> +{
>>>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> + asm volatile(
>>>> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>>> +
>>>> + LOAD_4X4(0x10, 0x00, %1)
>>>> + TRANSFORM_4X4_COL
>>>> + STORE_4X4(0x10, 0x00, (%0))
>>>> + LOAD_4X4(0x10, 0x08, %1)
>>>> + TRANSFORM_4X4_COL
>>>> + STORE_4X4(0x10, 0x08, (%0))
>>>> +
>>>> + "pxor %%mm7, %%mm7\n\t"
>>>> + LOAD_4X4(0x08, 0x00, (%0))
>>>> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>>> + "add %3, %2\n\t"
>>>> + LOAD_4X4(0x08, 0x20, (%0))
>>>> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>>> + : "+r"(block), "+m"(temp), "+r"(dest)
>>>> + : "r"(linesize)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>> +{
>>>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>> + asm volatile(
>>>> + LOAD_4X4(0x10, 0x00, (%0))
>>>> + TRANSFORM_4X4_ROW
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x00, %1)
>>>> + LOAD_4X4(0x10, 0x40, (%0))
>>>> + TRANSFORM_4X4_ROW
>>>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>> + STORE_4X4(0x10, 0x40, %1)
>>>> +
>>>> + TRANSFORM_4X8_COL(0x00, %1, (%0))
>>>> +
>>>> + "pxor %%mm7, %%mm7\n\t"
>>>> + LOAD_4X4(0x10, 0x00, (%0))
>>>> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>>> + "add %3, %2\n\t"
>>>> + LOAD_4X4(0x10, 0x40, (%0))
>>>> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>>> + : "+r"(block), "+m"(temp), "+r"(dest)
>>>> + : "r"(linesize)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>> +
>>>> +static void vc1_inv_trans_4x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>> +{
>>>> + asm volatile(
>>>> + LOAD_4X4(0x10, 0x00, (%1))
>>>> + TRANSFORM_4X4_ROW
>>>> + TRANSFORM_4X4_COL
>>>> + "pxor %%mm7, %%mm7\n\t"
>>>> + LOAD_ADD_CLAMP_STORE_4X4(%0, %2)
>>>> + : "+r"(dest)
>>>> + : "r"(block), "r"(linesize)
>>>> + : "memory"
>>>> + );
>>>> +}
>>>>
>>>>
>>> I do not think that brute-force duplicating and unrolling of all
>>> variants is optimal. Also, benchmarks are needed: C vs. your MMX vs.
>>> MMX code with no duplicated transforms.
>>>
>>> [...]
>>>
>>>
>> What do you mean by duplicated transforms? Do you mean that, for example,
>> the 4x4 row transformation should be a function instead of a macro?
>>
>
> Yes (non-inlined functions, or just plain asm that uses loops/calls,
> whatever); I meant this should be tried. Code cache is a limited resource,
> and your code after macro expansion is likely large. I do not know which
> way would be faster, of course, but it should be tested.
>
>
I tried it with non-inlined functions for the 8x8 transformation, but for
me it was slower.
>
>> As for benchmarks, I used START_TIMER and STOP_TIMER from libavutil with
>> mingw 4.3.0-tdm3. I don't know if it is an appropriate method, but I did
>> it somewhat like this:
>>
>> for(i = 0; i < (1 << 19); ++i) {
>>     memcpy(block1, block, 64 * sizeof(short));
>>     START_TIMER
>>     vc1_inv_trans_4x4_c(dest, 8, block1);
>>     STOP_TIMER("vc1_inv_trans_4x4_c")
>> }
>>
>
> The transforms should be tested in the real code in vc1.c; just put
> START/STOP_TIMER around the call to the transform.
>
>
> [...]
>
>
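That is what the attached vc1_timer.patch does, roughly like this around
each call in vc1.c (a sketch; the actual call sites and field names in
vc1.c differ slightly):

    {
        START_TIMER
        s->dsp.vc1_inv_trans_8x8(block[i]);
        STOP_TIMER("vc1_inv_trans_8x8")
    }
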
These benchmarks were done with mingw 3.4.5.
For the C version:
19801 dezicycles in vc1_inv_trans_8x8, 1048466 runs, 110 skips
4069 dezicycles in vc1_inv_trans_4x4, 262046 runs, 98 skips
9432 dezicycles in vc1_inv_trans_4x8, 524263 runs, 25 skips
8056 dezicycles in vc1_inv_trans_8x4, 524259 runs, 29 skips
3992 dezicycles in vc1_inv_trans_4x4, 524148 runs, 140 skips
19743 dezicycles in vc1_inv_trans_8x8, 2096925 runs, 227 skips
9393 dezicycles in vc1_inv_trans_4x8, 1048529 runs, 47 skips
8006 dezicycles in vc1_inv_trans_8x4, 1048520 runs, 56 skips
3929 dezicycles in vc1_inv_trans_4x4, 1048312 runs, 264 skips
3893 dezicycles in vc1_inv_trans_4x4, 2096685 runs, 467 skips
7950 dezicycles in vc1_inv_trans_8x4, 2097040 runs, 112 skips
9358 dezicycles in vc1_inv_trans_4x8, 2097028 runs, 124 skips
19529 dezicycles in vc1_inv_trans_8x8, 4193864 runs, 440 skips
For the MMX version:
7157 dezicycles in vc1_inv_trans_8x8, 1048200 runs, 376 skips
2567 dezicycles in vc1_inv_trans_4x4, 261770 runs, 374 skips
4454 dezicycles in vc1_inv_trans_4x8, 523938 runs, 350 skips
3949 dezicycles in vc1_inv_trans_8x4, 523877 runs, 411 skips
2532 dezicycles in vc1_inv_trans_4x4, 523793 runs, 495 skips
7159 dezicycles in vc1_inv_trans_8x8, 2096437 runs, 715 skips
4424 dezicycles in vc1_inv_trans_4x8, 1047981 runs, 595 skips
3914 dezicycles in vc1_inv_trans_8x4, 1047851 runs, 725 skips
2512 dezicycles in vc1_inv_trans_4x4, 1047902 runs, 674 skips
2501 dezicycles in vc1_inv_trans_4x4, 2096179 runs, 973 skips
3896 dezicycles in vc1_inv_trans_8x4, 2095994 runs, 1158 skips
4412 dezicycles in vc1_inv_trans_4x8, 2096082 runs, 1070 skips
7142 dezicycles in vc1_inv_trans_8x8, 4193046 runs, 1258 skips
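Comparing the largest runs, the MMX versions come out roughly 19529/7142 ≈
2.7x faster for 8x8, 9358/4412 ≈ 2.1x for 4x8, 7950/3896 ≈ 2.0x for 8x4,
and 3893/2501 ≈ 1.6x for 4x4.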
If you are wondering why the C version of the 8x8 transformation is no
longer as fast as the MMX version, as my last post suggested: I mistakenly
benchmarked the MMX version twice last time. I have attached a patch which
adds START/STOP_TIMER around the calls so that someone else can check.
[...]
>
>> +#define SRA2(r0,r1,c0)\
>> + "psraw $"#c0", "#r0"\n\t" /* r0 >> c0 */\
>> + "psraw $"#c0", "#r1"\n\t" /* r1 >> c0 */
>> +
>> +/*
>> + postcondition:
>> + r0 = r0 >> c0;
>> + r1 = r1 >> c0;
>> + r2 = r2 >> c0;
>> + r3 = r3 >> c0;
>> +*/
>> +#define SRA4(r0,r1,r2,r3,c0)\
>> + SRA2(r0,r1,c0)\
>> + SRA2(r2,r3,c0)
>>
>
> I think a generic macro like:
>
> #define OPC_SS_AB(opc, dst0, dst1, src) \
>     opc" "src", "dst0"\n\t" \
>     opc" "src", "dst1"\n\t"
>
> #define OPC_SSSS_ABCD(opc, dst0, dst1, dst2, dst3, src) \
>     OPC_SS_AB(opc, dst0, dst1, src) \
>     OPC_SS_AB(opc, dst2, dst3, src)
>
> would be simpler
>
>
> [...]
>
>> +DECLARE_ALIGNED_16(static const int16_t, constants[]) = {
>> + X4( 4),
>> + X4(64)
>> +};
>>
>
> Also, please reduce the number of macros in your code; this hurts
> readability a lot. That means removing at least every macro that is
> bigger than the space it saves. An example:
>
>     4, 4, 4, 4,
>     64,64,64,64
>
> vs.
>
>     #define X4(x) x, x, x, x
>     X4( 4),
>     X4(64)
>
> [...]
>
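As a usage sketch for the generic macro (assuming the OPC_SSSS_ABCD form
above), SRA4(%%mm0,%%mm1,%%mm2,%%mm3,3) would become:

    /* shift four registers right by the same immediate (illustration) */
    OPC_SSSS_ABCD("psraw", "%%mm0", "%%mm1", "%%mm2", "%%mm3", "$3")
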
Hopefully all issues have been addressed.
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: dsputil.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080628/4b5806e8/attachment.asc>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: vc1_inv_trans_mmx_v3.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080628/4b5806e8/attachment.txt>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: vc1_timer.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080628/4b5806e8/attachment-0001.asc>