[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations
Michael Niedermayer
michaelni
Wed Jul 9 11:53:45 CEST 2008
On Mon, Jul 07, 2008 at 09:02:28PM +0200, Victor Pollex wrote:
> Michael Niedermayer schrieb:
>> On Thu, Jul 03, 2008 at 02:51:18PM +0200, Victor Pollex wrote:
[...]
>>> +/*
>>> + precodition:
>>> + for all values v in r0, r1, r2, r3: -3971 <= v <= 3971
>>> +
>>> + postcondition:
>>> + r3 = ((17 * (r0 + r2) + (22 * r1 + 10 * r3) + c) >> 3)
>>> + r4 = ((17 * (r0 - r2) - (10 * r1 - 22 * r3) + c) >> 3)
>>> + r1 = ((17 * (r0 - r2) + (10 * r1 - 22 * r3) + c) >> 3)
>>> + r2 = ((17 * (r0 + r2) - (22 * r1 + 10 * r3) + c) >> 3)
>>> + r0 undefined
>>> + r5 undefined
>>> + r6 undefined
>>> + r7 undefined
>>> +*/
>>> +#define TRANSFORM_4X4_ROW(r0,r1,r2,r3,r4,r5,r6,r7,c)\
>>> + TRANSPOSE4(r0,r1,r2,r3,r4)\
>>> + TRANSFORM_4X4_COMMON(r0,r3,r4,r2,r1,r5,r6,r7,c)\
>>> + "paddw "#r4", "#r4"\n\t" /* 2 * (r0 + r2) */\
>>> + SUMSUB_BA(r3,r4)\
>>> + "paddw "#r1", "#r3"\n\t"\
>>> + "paddw "#r7", "#r4"\n\t"\
>>> + "paddw "#r0", "#r0"\n\t" /* 2 * (r0 - r2) */\
>>> + SUMSUB_BA(r2,r0)\
>>> + "paddw "#r5", "#r0"\n\t"\
>>> + "paddw "#r6", "#r2"\n\t"\
>>> + TRANSPOSE4(r3,r0,r2,r4,r1)
>>>
>>
>> It should be possible to merge one transpose into the scantble (the
>> mpeg1/2/4
>> decoder does that too)
>>
>>
> I'm not sure if this should be done as I found the following lines in
> decode_sequence_header in vc1.c
> if (!v->res_fasttx)
> {
> v->s.dsp.vc1_inv_trans_8x8 = ff_simple_idct;
> v->s.dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
> v->s.dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
> v->s.dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
> }
The used permutation should of course depend on the used idct
>
>> [...]
>>
>>> +/*
>>> + postcondition:
>>> + r0 = [15:0](2 * r0);
>>> + r1 = [15:0](3 * r0);
>>> +*/
>>> +#define G3X(r0,r1)\
>>> + "movq "#r0", "#r1"\n\t" /* r0 */\
>>> + "paddw "#r0", "#r0"\n\t" /* 2 * r0 */\
>>> + "paddw "#r0", "#r1"\n\t" /* 3 * r0 */
>>>
>>
>> 4 uses, saving 8 lines, macro with docs is 9 lines
>>
> removed docs
I would prefer if you would remove the macro, it would make the code easier
to understand and more flexible.
[...]
> Index: libavcodec/i386/vc1dsp_mmx.c
> ===================================================================
> --- libavcodec/i386/vc1dsp_mmx.c (Revision 14101)
> +++ libavcodec/i386/vc1dsp_mmx.c (Arbeitskopie)
> @@ -1,6 +1,7 @@
> /*
> * VC-1 and WMV3 - DSP functions MMX-optimized
> * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
> + * Copyright (c) 2008 Victor Pollex
> *
> * Permission is hereby granted, free of charge, to any person
> * obtaining a copy of this software and associated documentation
> @@ -467,7 +468,466 @@
> DECLARE_FUNCTION(3, 2)
> DECLARE_FUNCTION(3, 3)
>
> +#define OPC_SS_AB(opc, src, dst0, dst1)\
> + #opc" "#src","#dst0"\n\t"\
> + #opc" "#src","#dst1"\n\t"
> +
> +#define OPC_SSSS_ABCD(opc, src, dst0, dst1, dst2, dst3)\
> + OPC_SS_AB(opc, src, dst0, dst1)\
> + OPC_SS_AB(opc, src, dst2, dst3)
> +
> +#define ADD1SUB1(src, dst0, dst1)\
> + "paddw "#src", "#dst0"\n\t"\
> + "psubw "#src", "#dst1"\n\t"
using
"paddw src, dst0\n\t psubw src, dst1\n\t"
instead of yet another macro
will be 4 lines shorter and more readable
> +
> +/*
> + precodition:
> + for all values v in r0, r1, r2, r3: -2^15 <= 5 * v + c / 2 <= 2^15 - 1
> +
> + postcondition:
> + r0 = r0 - r2
> + r1 = 2 * r1 + r3
> + r2 = r0 + r2
> + r3 = 2 * r3 - r1
> + r4 = (((r0 + r2 + c) >> 1) + (3 * r1 + r3)) >> 2
> + r5 = (((r0 - r2 + c) >> 1) - (3 * r3 - r1)) >> 2
> + r6 = (((r0 - r2 + c) >> 1) + (3 * r3 - r1)) >> 2
> + r7 = (((r0 + r2 + c) >> 1) - (3 * r1 + r3)) >> 2
> +*/
> +#define TRANSFORM_4X4_COMMON(r0,r1,r2,r3,r4,r5,r6,r7,c)\
> + SUMSUB_BA(r2,r0)\
> + "movq "#r0", "#r5"\n\t" /* r0 - r2 */\
> + "movq "#r2", "#r7"\n\t" /* r0 + r2 */\
> + "movq "#c", "#r4"\n\t" /* c */\
> + "paddw "#r4", "#r5"\n\t" /* r0 - r2 + c */\
> + "paddw "#r4", "#r7"\n\t" /* r0 + r2 + c */\
r5= c c
r0-=r2 r0-r2
r2+=r2 2r2
r6=r2 2r2
r2+=r0 r2+r2
r5+=r0 r0-r2+c
r6+=r5 r0+r2+c
one instruction less
> + OPC_SS_AB(psraw,$1,r5,r7)\
> + "movq "#r1", "#r4"\n\t" /* r1 */\
> + "movq "#r3", "#r6"\n\t" /* r3 */\
> + "paddw "#r1", "#r1"\n\t" /* 2 * r1 */\
> + "paddw "#r6", "#r1"\n\t" /* 2 * r1 + r3 */\
> + "paddw "#r3", "#r3"\n\t" /* 2 * r3 */\
> + "psubw "#r4", "#r3"\n\t" /* 2 * r3 - r1 */\
> + "paddw "#r1", "#r4"\n\t" /* 3 * r1 + r3 */\
> + "paddw "#r3", "#r6"\n\t" /* 3 * r3 - r1 */\
r4=r3 r3
r3+=r1 r3+r1
r1+=r1 2r1
r3+=r1 r3+3r1
r4-=r1 r3-2r1
r1+=r1 2r3-4r1
r1+=r3 3r3-r1
one instruction less
[...]
> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
> +{
> + DECLARE_ALIGNED_16(int16_t, temp[64]);
> + asm volatile(
> + TRANSFORM_8X4_ROW(0x00(%0),0x00%1)
> + TRANSFORM_8X4_ROW(0x40(%0),0x40%1)
> +
> + LOAD4(0x10,0x00%1,%%mm0,%%mm1,%%mm2,%%mm3)
> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> + STORE4(0x10,0x00%1,%%mm0,%%mm3,%%mm4,%%mm2)
> + LOAD4(0x10,0x08%1,%%mm0,%%mm1,%%mm2,%%mm3)
> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> + STORE4(0x10,0x08%1,%%mm0,%%mm3,%%mm4,%%mm2)
> +
> + LOAD4(0x10,0x40%1,%%mm0,%%mm1,%%mm2,%%mm3)
> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> + STORE4(0x10,0x40%1,%%mm0,%%mm3,%%mm4,%%mm2)
> + LOAD4(0x10,0x48%1,%%mm0,%%mm1,%%mm2,%%mm3)
> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> + STORE4(0x10,0x48%1,%%mm0,%%mm3,%%mm4,%%mm2)
> +
> + TRANSFORM_4X8_COL(0x00%1,0x00(%0),%2)
> + TRANSFORM_4X8_COL(0x08%1,0x08(%0),%2)
> + :
> + : "r"(block), "m"(temp[0]), "m"(constants[4])
> + : "memory"
> + );
> +}
> +
> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> + DECLARE_ALIGNED_16(int16_t, temp[64]);
> + asm volatile(
> + TRANSFORM_8X4_ROW(0x00(%1),0x00%2)
> +
> + LOAD4(0x10,0x00%2,%%mm0,%%mm1,%%mm2,%%mm3)
> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> + TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
> + "pxor %%mm7, %%mm7\n\t"
> + LOAD_ADD_CLAMP_STORE_4X4(%4,%0,%%mm1,%%mm3,%%mm0,%%mm2,%%mm4)
> +
> + LOAD4(0x10,0x08%2,%%mm0,%%mm1,%%mm2,%%mm3)
> + TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> + TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%5)
> + "pxor %%mm7, %%mm7\n\t"
> + LOAD_ADD_CLAMP_STORE_4X4(%4,%3,%%mm1,%%mm3,%%mm0,%%mm2,%%mm4)
> + : "+r"(dest)
> + : "r"(block), "m"(temp[0]), "r"(dest+4), "r"(linesize), "m"(constants[4])
> + : "memory"
> + );
> +}
> +
> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> + DECLARE_ALIGNED_16(int16_t, temp[64]);
> + asm volatile(
> + LOAD4(0x10,0x00(%1),%%mm0,%%mm1,%%mm2,%%mm3)
> + TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
> + STORE4(0x10,0x00%2,%%mm3,%%mm4,%%mm1,%%mm2)
> + LOAD4(0x10,0x40(%1),%%mm0,%%mm1,%%mm2,%%mm3)
> + TRANSFORM_4X4_ROW(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4,%%mm5,%%mm6,%%mm7,%4)
> + STORE4(0x10,0x40%2,%%mm3,%%mm4,%%mm1,%%mm2)
> +
> + TRANSFORM_4X8_COL(0x00%2,0x00(%1),0x08+%4)
> +
> + "pxor %%mm7, %%mm7\n\t"
> + LOAD4(0x10,0x00(%1),%%mm0,%%mm1,%%mm2,%%mm3)
> + LOAD_ADD_CLAMP_STORE_4X4(%3,%0,%%mm4,%%mm0,%%mm1,%%mm2,%%mm3)
> + "add %3, %0\n\t"
> + LOAD4(0x10,0x40(%1),%%mm0,%%mm1,%%mm2,%%mm3)
> + LOAD_ADD_CLAMP_STORE_4X4(%3,%0,%%mm4,%%mm0,%%mm1,%%mm2,%%mm3)
> + : "+r"(dest)
> + : "r"(block), "m"(temp[0]), "r"(linesize), "m"(constants[0])
> + : "memory"
> + );
> +}
some of the load&store are avoidable
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Good people do not need laws to tell them to act responsibly, while bad
people will find a way around the laws. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080709/2f38554e/attachment.pgp>
More information about the ffmpeg-devel
mailing list