[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations
Victor Pollex
victor.pollex
Thu Jul 17 23:29:59 CEST 2008
Michael Niedermayer schrieb:
> On Mon, Jul 07, 2008 at 09:02:28PM +0200, Victor Pollex wrote:
>
>> Michael Niedermayer schrieb:
>>
>>> On Thu, Jul 03, 2008 at 02:51:18PM +0200, Victor Pollex wrote:
>>>
> [...]
>
>>>> +/*
>>>> + precodition:
>>>> + for all values v in r0, r1, r2, r3: -3971 <= v <= 3971
>>>> +
>>>> + postcondition:
>>>> + r3 = ((17 * (r0 + r2) + (22 * r1 + 10 * r3) + c) >> 3)
>>>> + r4 = ((17 * (r0 - r2) - (10 * r1 - 22 * r3) + c) >> 3)
>>>> + r1 = ((17 * (r0 - r2) + (10 * r1 - 22 * r3) + c) >> 3)
>>>> + r2 = ((17 * (r0 + r2) - (22 * r1 + 10 * r3) + c) >> 3)
>>>> + r0 undefined
>>>> + r5 undefined
>>>> + r6 undefined
>>>> + r7 undefined
>>>> +*/
>>>> +#define TRANSFORM_4X4_ROW(r0,r1,r2,r3,r4,r5,r6,r7,c)\
>>>> + TRANSPOSE4(r0,r1,r2,r3,r4)\
>>>> + TRANSFORM_4X4_COMMON(r0,r3,r4,r2,r1,r5,r6,r7,c)\
>>>> + "paddw "#r4", "#r4"\n\t" /* 2 * (r0 + r2) */\
>>>> + SUMSUB_BA(r3,r4)\
>>>> + "paddw "#r1", "#r3"\n\t"\
>>>> + "paddw "#r7", "#r4"\n\t"\
>>>> + "paddw "#r0", "#r0"\n\t" /* 2 * (r0 - r2) */\
>>>> + SUMSUB_BA(r2,r0)\
>>>> + "paddw "#r5", "#r0"\n\t"\
>>>> + "paddw "#r6", "#r2"\n\t"\
>>>> + TRANSPOSE4(r3,r0,r2,r4,r1)
>>>>
>>>>
>>> It should be possible to merge one transpose into the scantble (the
>>> mpeg1/2/4
>>> decoder does that too)
>>>
>>>
>>>
>> I'm not sure if this should be done as I found the following lines in
>> decode_sequence_header in vc1.c
>> if (!v->res_fasttx)
>> {
>> v->s.dsp.vc1_inv_trans_8x8 = ff_simple_idct;
>> v->s.dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
>> v->s.dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
>> v->s.dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
>> }
>>
>
> The used permutation should of course depend on the used idct
>
ok, changed it. I could also try to do the same with the scantables for
the 8x4 and 4x8 transformations if desired.
>
>
>>> [...]
>>>
>>>
>>>> +/*
>>>> + postcondition:
>>>> + r0 = [15:0](2 * r0);
>>>> + r1 = [15:0](3 * r0);
>>>> +*/
>>>> +#define G3X(r0,r1)\
>>>> + "movq "#r0", "#r1"\n\t" /* r0 */\
>>>> + "paddw "#r0", "#r0"\n\t" /* 2 * r0 */\
>>>> + "paddw "#r0", "#r1"\n\t" /* 3 * r0 */
>>>>
>>>>
>>> 4 uses, saving 8 lines, macro with docs is 9 lines
>>>
>>>
>> removed docs
>>
>
> I would prefer if you would remove the macro, it would make the code easier
> to understand and more flexible.
>
removed it.
>
> [...]
>
>> Index: libavcodec/i386/vc1dsp_mmx.c
>> ===================================================================
>> --- libavcodec/i386/vc1dsp_mmx.c (Revision 14101)
>> +++ libavcodec/i386/vc1dsp_mmx.c (Arbeitskopie)
>> @@ -1,6 +1,7 @@
>> /*
>> * VC-1 and WMV3 - DSP functions MMX-optimized
>> * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>> + * Copyright (c) 2008 Victor Pollex
>> *
>> * Permission is hereby granted, free of charge, to any person
>> * obtaining a copy of this software and associated documentation
>> @@ -467,7 +468,466 @@
>> DECLARE_FUNCTION(3, 2)
>> DECLARE_FUNCTION(3, 3)
>>
>> +#define OPC_SS_AB(opc, src, dst0, dst1)\
>> + #opc" "#src","#dst0"\n\t"\
>> + #opc" "#src","#dst1"\n\t"
>> +
>> +#define OPC_SSSS_ABCD(opc, src, dst0, dst1, dst2, dst3)\
>> + OPC_SS_AB(opc, src, dst0, dst1)\
>> + OPC_SS_AB(opc, src, dst2, dst3)
>> +
>>
>
>
>> +#define ADD1SUB1(src, dst0, dst1)\
>> + "paddw "#src", "#dst0"\n\t"\
>> + "psubw "#src", "#dst1"\n\t"
>>
>
> using
>
> "paddw src, dst0\n\t psubw src, dst1\n\t"
>
> instead of yet another macro
> will be 4 lines shorter and more readable
>
>
removed it. I used two separate lines and commented them instead of
doing it like you suggested.
>
>
>> +
>> +/*
>> + precodition:
>> + for all values v in r0, r1, r2, r3: -2^15 <= 5 * v + c / 2 <= 2^15 - 1
>> +
>> + postcondition:
>> + r0 = r0 - r2
>> + r1 = 2 * r1 + r3
>> + r2 = r0 + r2
>> + r3 = 2 * r3 - r1
>> + r4 = (((r0 + r2 + c) >> 1) + (3 * r1 + r3)) >> 2
>> + r5 = (((r0 - r2 + c) >> 1) - (3 * r3 - r1)) >> 2
>> + r6 = (((r0 - r2 + c) >> 1) + (3 * r3 - r1)) >> 2
>> + r7 = (((r0 + r2 + c) >> 1) - (3 * r1 + r3)) >> 2
>> +*/
>> +#define TRANSFORM_4X4_COMMON(r0,r1,r2,r3,r4,r5,r6,r7,c)\
>>
>
>
>> + SUMSUB_BA(r2,r0)\
>> + "movq "#r0", "#r5"\n\t" /* r0 - r2 */\
>> + "movq "#r2", "#r7"\n\t" /* r0 + r2 */\
>> + "movq "#c", "#r4"\n\t" /* c */\
>> + "paddw "#r4", "#r5"\n\t" /* r0 - r2 + c */\
>> + "paddw "#r4", "#r7"\n\t" /* r0 + r2 + c */\
>>
>
> r5= c c
> r0-=r2 r0-r2
> r2+=r2 2r2
> r6=r2 2r2
> r2+=r0 r2+r2
> r5+=r0 r0-r2+c
> r6+=r5 r0+r2+c
>
> one instruction less
>
>
did it a bit differently, also with 7 instructions.
>
>> + OPC_SS_AB(psraw,$1,r5,r7)\
>>
>
>
>> + "movq "#r1", "#r4"\n\t" /* r1 */\
>> + "movq "#r3", "#r6"\n\t" /* r3 */\
>> + "paddw "#r1", "#r1"\n\t" /* 2 * r1 */\
>> + "paddw "#r6", "#r1"\n\t" /* 2 * r1 + r3 */\
>> + "paddw "#r3", "#r3"\n\t" /* 2 * r3 */\
>> + "psubw "#r4", "#r3"\n\t" /* 2 * r3 - r1 */\
>> + "paddw "#r1", "#r4"\n\t" /* 3 * r1 + r3 */\
>> + "paddw "#r3", "#r6"\n\t" /* 3 * r3 - r1 */\
>>
>
> r4=r3 r3
> r3+=r1 r3+r1
> r1+=r1 2r1
> r3+=r1 r3+3r1
> r4-=r1 r3-2r1
> r1+=r1 2r3-4r1
> r1+=r3 3r3-r1
>
> one instruction less
>
>
>
unfortunately the 2r1+r3 and 2r3-r1 are needed later.
> [...]
>
>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + TRANSFORM_8X4_ROW(0x00(%0),0x00%1)
>> + TRANSFORM_8X4_ROW(0x40(%0),0x40%1)
>> +
>> + LOAD4(0x10,0x00%1,%%mm0,%%mm1,%%mm2,%%mm3)
>> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> + STORE4(0x10,0x00%1,%%mm0,%%mm3,%%mm4,%%mm2)
>> + LOAD4(0x10,0x08%1,%%mm0,%%mm1,%%mm2,%%mm3)
>> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> + STORE4(0x10,0x08%1,%%mm0,%%mm3,%%mm4,%%mm2)
>> +
>> + LOAD4(0x10,0x40%1,%%mm0,%%mm1,%%mm2,%%mm3)
>> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> + STORE4(0x10,0x40%1,%%mm0,%%mm3,%%mm4,%%mm2)
>> + LOAD4(0x10,0x48%1,%%mm0,%%mm1,%%mm2,%%mm3)
>> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> + STORE4(0x10,0x48%1,%%mm0,%%mm3,%%mm4,%%mm2)
>> +
>> + TRANSFORM_4X8_COL(0x00%1,0x00(%0),%2)
>> + TRANSFORM_4X8_COL(0x08%1,0x08(%0),%2)
>> + :
>> + : "r"(block), "m"(temp[0]), "m"(constants[4])
>> + : "memory"
>> + );
>> +}
>> +
>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + TRANSFORM_8X4_ROW(0x00(%1),0x00%2)
>> +
>> + LOAD4(0x10,0x00%2,%%mm0,%%mm1,%%mm2,%%mm3)
>> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> + TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD_ADD_CLAMP_STORE_4X4(%4,%0,%%mm1,%%mm3,%%mm0,%%mm2,%%mm4)
>> +
>> + LOAD4(0x10,0x08%2,%%mm0,%%mm1,%%mm2,%%mm3)
>> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> + TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD_ADD_CLAMP_STORE_4X4(%4,%3,%%mm1,%%mm3,%%mm0,%%mm2,%%mm4)
>> + : "+r"(dest)
>> + : "r"(block), "m"(temp[0]), "r"(dest+4), "r"(linesize), "m"(constants[4])
>> + : "memory"
>> + );
>> +}
>> +
>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + LOAD4(0x10,0x00(%1),%%mm0,%%mm1,%%mm2,%%mm3)
>> + TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
>> + STORE4(0x10,0x00%2,%%mm3,%%mm4,%%mm1,%%mm2)
>> + LOAD4(0x10,0x40(%1),%%mm0,%%mm1,%%mm2,%%mm3)
>> + TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
>> + STORE4(0x10,0x40%2,%%mm3,%%mm4,%%mm1,%%mm2)
>> +
>> + TRANSFORM_4X8_COL(0x00%2,0x00(%1),0x08+%4)
>> +
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD4(0x10,0x00(%1),%%mm0,%%mm1,%%mm2,%%mm3)
>> + LOAD_ADD_CLAMP_STORE_4X4(%3,%0,%%mm4,%%mm0,%%mm1,%%mm2,%%mm3)
>> + "add %3, %0\n\t"
>> + LOAD4(0x10,0x40(%1),%%mm0,%%mm1,%%mm2,%%mm3)
>> + LOAD_ADD_CLAMP_STORE_4X4(%3,%0,%%mm4,%%mm0,%%mm1,%%mm2,%%mm3)
>> + : "+r"(dest)
>> + : "r"(block), "m"(temp[0]), "r"(linesize), "m"(constants[0])
>> + : "memory"
>> + );
>> +}
>>
>
> some of the load&store are avoidable
>
>
The only thing I can think of is in the 4x8 transformation, where the
column transformation is done in two halfs. Instead of writing the lines
to "block" after each half, I could add them to "dest". Is this what you
had in mind?
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: vc1_inv_trans_mmx_v6.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080717/d11219f4/attachment.txt>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: vc1_zz_4x4.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080717/d11219f4/attachment.asc>
More information about the ffmpeg-devel
mailing list