[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations
Victor Pollex
victor.pollex
Sun Jun 22 15:21:53 CEST 2008
Michael Niedermayer wrote:
> On Sat, Jun 21, 2008 at 03:37:44PM +0200, Victor Pollex wrote:
>
>> Hi,
>> as in subject.
>>
>> Victor Pollex
>>
>
>
>> Index: libavcodec/i386/vc1dsp_mmx.c
>> ===================================================================
>> --- libavcodec/i386/vc1dsp_mmx.c (revision 13854)
>> +++ libavcodec/i386/vc1dsp_mmx.c (working copy)
>> @@ -1,6 +1,7 @@
>> /*
>> * VC-1 and WMV3 - DSP functions MMX-optimized
>> * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>> + * Copyright (c) 2008 Victor Pollex
>> *
>> * Permission is hereby granted, free of charge, to any person
>> * obtaining a copy of this software and associated documentation
>> @@ -467,7 +468,609 @@
>> DECLARE_FUNCTION(3, 2)
>> DECLARE_FUNCTION(3, 3)
>>
>> +#define LOAD_4X4(stride,base,in)\
>> + "movq 0*"#stride"+"#base#in", %%mm0\n\t"\
>> + "movq 1*"#stride"+"#base#in", %%mm1\n\t"\
>> + "movq 2*"#stride"+"#base#in", %%mm2\n\t"\
>> + "movq 3*"#stride"+"#base#in", %%mm3\n\t"
>>
>
> duplicate of LOAD4
>
the only LOAD4 I found is in dsputilenc_mmx.c, and it has a fixed stride of
8, but I also need a stride of 16. If I missed something, give me a hint.
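If sharing one macro is wanted, it could take the stride (and the registers)
as arguments, along the lines of my LOAD_4X4 above; a sketch only, the name
LOAD4W is made up:

#define LOAD4W(stride, base, in, r0, r1, r2, r3)\
    "movq 0*"#stride"+"#base#in", "#r0"\n\t"\
    "movq 1*"#stride"+"#base#in", "#r1"\n\t"\
    "movq 2*"#stride"+"#base#in", "#r2"\n\t"\
    "movq 3*"#stride"+"#base#in", "#r3"\n\t"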
>
>
>> +
>> +#define STORE_4X4(stride,base,out)\
>> + "movq %%mm0, 0*"#stride"+"#base#out"\n\t"\
>> + "movq %%mm1, 1*"#stride"+"#base#out"\n\t"\
>> + "movq %%mm2, 2*"#stride"+"#base#out"\n\t"\
>> + "movq %%mm3, 3*"#stride"+"#base#out"\n\t"
>> +
>>
>
> duplicate of STORE4
>
Same as with LOAD4, except I only need a stride of 16.
>
>
>> +/*
>> + precondition:
>> + r0 = row0/col0
>> + r1 = row1/col1
>> + r2 = row2/col2
>> + r3 = row3/col3
>> +
>> + postcondition:
>> + r0 = col0/row0
>> + r1 = col1/row1
>> + r2 = col2/row2
>> + r3 = col3/row3
>> + t0 = undefined
>> +*/
>> +#define TRANSPOSE_4X4(r0,r1,r2,r3,t0)\
>> + "movq "#r2", "#t0"\n\t"\
>> + "punpcklwd "#r3", "#r2"\n\t"\
>> + "punpckhwd "#r3", "#t0"\n\t"\
>> + \
>> + "movq "#r0", "#r3"\n\t"\
>> + "punpcklwd "#r1", "#r0"\n\t"\
>> + "punpckhwd "#r1", "#r3"\n\t"\
>> + \
>> + "movq "#r0", "#r1"\n\t"\
>> + "punpckldq "#r2", "#r0"\n\t"\
>> + "punpckhdq "#r2", "#r1"\n\t"\
>> + \
>> + "movq "#r3", "#r2"\n\t"\
>> + "punpckldq "#t0", "#r2"\n\t"\
>> + "punpckhdq "#t0", "#r3"\n\t"
>>
>
> duplicate of TRANSPOSE4
>
Basically yes, but here the output registers are the same, and in the same
order, as the input registers, whereas with TRANSPOSE4 the input registers
are a, b, c, d and the output registers are a, d, t, c according to the
comments. I could try to rewrite part of the code to use TRANSPOSE4, but I
would rather rewrite it so that I don't need to transpose in the first place.
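For reference, the punpck sequence above is just an in-place transpose of a
4x4 block of 16-bit words: the two punpcklwd/punpckhwd pairs interleave the
rows into 2x2 word blocks, and the punpckldq/punpckhdq pass moves each
32-bit pair into its final place. In plain C the same operation is
(illustration only, not part of the patch):

static void transpose4x4_ref(int16_t m[4][4])
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = i + 1; j < 4; j++) {
            int16_t t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}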
>
>
>> +
>> +
>> +/*
>> + precondition:
>> + -(2^15) <= r0 < 2^15
>> + -(2^14) <= r1 < 2^14
>> + -(2^15) <= r1 + r0 < 2^15
>> + postcondition:
>> + r0 = r1 + r0
>> + r1 = r1 - r0
>> +*/
>> +#define TRANSFORM_COMMON_ADDSUB(r0,r1)\
>> + "paddw "#r1", "#r0" \n\t"\
>> + "psllw $1, "#r1" \n\t"\
>> + "psubw "#r0", "#r1" \n\t"
>> +
>>
>
> duplicate of SUMSUB_BA
>
changed it.
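For clarity, here is what the macro (and, as far as I can tell, SUMSUB_BA)
computes, written as scalar C; illustration only, since the MMX version
wraps mod 2^16 per lane, which is why the preconditions above matter:

static inline void sumsub(int16_t *r0, int16_t *r1)
{
    int sum  = *r1 + *r0; /* computed as int, so no overflow here */
    int diff = *r1 - *r0;
    *r0 = sum;            /* truncates to 16 bits like paddw */
    *r1 = diff;           /* truncates to 16 bits like psubw */
}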
>
>
>> +/*
>> + postcondition:
>> + r0 = [15:0](r0 + r2);
>> + r1 = [15:0](r1 - r2);
>> +*/
>> +#define TRANSFORM_COMMON_ADD1SUB1(r0,r1,r2)\
>> + "paddw "#r2", "#r0"\n\t" /* r0 + r2 */\
>> + "psubw "#r2", "#r1"\n\t" /* r1 - r2 */
>>
>
> "TRANSFORM_COMMON" says nothing about any of the macros it just
> makes them longer
>
removed it.
>
> [...]
>
>> +#define TRANSFORM_4X4_COMMON(r0,r1,r2,r3,r4,r5,r6,r7,c0)\
>> + TRANSFORM_COMMON_ADDSUB(r2,r0)\
>> + "movq "#r0", "#r5"\n\t" /* r0 - r2 */\
>> + "movq "#r2", "#r7"\n\t" /* r0 + r2 */\
>>
>
>
>> + "pcmpeqw "#r4", "#r4"\n\t" /* -1 */\
>> + "psllw $"#c0", "#r4"\n\t" /* -1 << c0 */\
>>
>
> c0 is a constant; calculating -1 << c0 at runtime is inefficient
>
>
I made them static consts which are loaded.
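Roughly in the style of the ff_pw_* constants in dsputil_mmx.c; a sketch
(the constant name pw_4 is mine, shown for c0 = 2):

DECLARE_ALIGNED_8(static const uint64_t, pw_4) = 0x0004000400040004ULL;
/* inside the asm block, instead of pcmpeqw + psllw + psubw: */
"paddw "MANGLE(pw_4)", %%mm5\n\t" /* r0 - r2 + (1 << c0) */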
>
>> + "psubw "#r4", "#r5"\n\t" /* r0 - r2 + (1 << c0) */\
>> + "psubw "#r4", "#r7"\n\t" /* r0 + r2 + (1 << c0) */\
>> + TRANSFORM_COMMON_SRA2(r5,r7,1)\
>> + "movq "#r1", "#r4"\n\t" /* r1 */\
>> + "movq "#r3", "#r6"\n\t" /* r3 */\
>> + \
>>
>
>
>> + "psllw $1, "#r1"\n\t" /* 2 * r1 */\
>>
>
> paddw is faster
>
changed it.
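i.e. the doubling becomes an add of the register to itself:

    "paddw "#r1", "#r1"\n\t" /* 2 * r1, same result as psllw $1 but cheaper */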
>
> [...]
>
>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
>> + TRANSFORM_8X4_ROW(0x40, (%0), %1)
>> +
>> +
>> + LOAD_4X4(0x10, 0x00, %1)
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x00, %1)
>> + LOAD_4X4(0x10, 0x40, %1)
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x40, %1)
>> + TRANSFORM_4X8_COL(0x00, %1, (%0))
>> +
>> + LOAD_4X4(0x10, 0x08, %1)
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x08, %1)
>> + LOAD_4X4(0x10, 0x48, %1)
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x48, %1)
>> + TRANSFORM_4X8_COL(0x08, %1, (%0))
>> + : "+r"(block), "+m"(temp)
>> + :
>> + : "memory"
>> + );
>> +}
>> +
>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
>> +
>> + LOAD_4X4(0x10, 0x00, %1)
>> + TRANSFORM_4X4_COL
>> + STORE_4X4(0x10, 0x00, (%0))
>> + LOAD_4X4(0x10, 0x08, %1)
>> + TRANSFORM_4X4_COL
>> + STORE_4X4(0x10, 0x08, (%0))
>> +
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD_4X4(0x08, 0x00, (%0))
>> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>> + "add %3, %2\n\t"
>> + LOAD_4X4(0x08, 0x20, (%0))
>> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>> + : "+r"(block), "+m"(temp), "+r"(dest)
>> + : "r"(linesize)
>> + : "memory"
>> + );
>> +}
>> +
>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> + DECLARE_ALIGNED_16(int16_t, temp[64]);
>> + asm volatile(
>> + LOAD_4X4(0x10, 0x00, (%0))
>> + TRANSFORM_4X4_ROW
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x00, %1)
>> + LOAD_4X4(0x10, 0x40, (%0))
>> + TRANSFORM_4X4_ROW
>> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>> + STORE_4X4(0x10, 0x40, %1)
>> +
>> + TRANSFORM_4X8_COL(0x00, %1, (%0))
>> +
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD_4X4(0x10, 0x00, (%0))
>> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>> + "add %3, %2\n\t"
>> + LOAD_4X4(0x10, 0x40, (%0))
>> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>> + : "+r"(block), "+m"(temp), "+r"(dest)
>> + : "r"(linesize)
>> + : "memory"
>> + );
>> +}
>> +
>> +static void vc1_inv_trans_4x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>> +{
>> + asm volatile(
>> + LOAD_4X4(0x10, 0x00, (%1))
>> + TRANSFORM_4X4_ROW
>> + TRANSFORM_4X4_COL
>> + "pxor %%mm7, %%mm7\n\t"
>> + LOAD_ADD_CLAMP_STORE_4X4(%0, %2)
>> + : "+r"(dest)
>> + : "r"(block), "r"(linesize)
>> + : "memory"
>> + );
>> +}
>>
>
> I do not think that brute force duplicating and unrolling of all variants
> is optimal. Also benchmarks are needed for C vs. your mmx vs. mmx
> code with no duplicated transforms
>
> [...]
>
What do you mean by duplicated transforms? Do you mean that, for example,
the 4x4 row transformation should be a function instead of a macro?
As for benchmarks, I used START_TIMER and STOP_TIMER from libavutil with
MinGW 4.3.0-tdm3.
I don't know if it is an appropriate method, but I did it somewhat like this:
for (i = 0; i < (1 << 19); ++i) {
    memcpy(block1, block, 64 * sizeof(short));
    START_TIMER
    vc1_inv_trans_4x4_c(dest, 8, block1);
    STOP_TIMER("vc1_inv_trans_4x4_c")
}
4057 dezicycles in vc1_inv_trans_4x4_c, 65519 runs, 17 skips
4048 dezicycles in vc1_inv_trans_4x4_c, 131038 runs, 34 skips
4050 dezicycles in vc1_inv_trans_4x4_c, 262080 runs, 64 skips
4048 dezicycles in vc1_inv_trans_4x4_c, 524167 runs, 121 skips
1917 dezicycles in vc1_inv_trans_4x4_mmx, 65529 runs, 7 skips
1917 dezicycles in vc1_inv_trans_4x4_mmx, 131049 runs, 23 skips
1915 dezicycles in vc1_inv_trans_4x4_mmx, 262104 runs, 40 skips
1915 dezicycles in vc1_inv_trans_4x4_mmx, 524199 runs, 89 skips
7676 dezicycles in vc1_inv_trans_4x8_c, 65503 runs, 33 skips
7654 dezicycles in vc1_inv_trans_4x8_c, 131011 runs, 61 skips
7586 dezicycles in vc1_inv_trans_4x8_c, 262017 runs, 127 skips
7570 dezicycles in vc1_inv_trans_4x8_c, 524031 runs, 257 skips
3882 dezicycles in vc1_inv_trans_4x8_mmx, 65523 runs, 13 skips
3881 dezicycles in vc1_inv_trans_4x8_mmx, 131044 runs, 28 skips
3881 dezicycles in vc1_inv_trans_4x8_mmx, 262091 runs, 53 skips
3880 dezicycles in vc1_inv_trans_4x8_mmx, 524172 runs, 116 skips
9561 dezicycles in vc1_inv_trans_8x4_c, 65500 runs, 36 skips
9545 dezicycles in vc1_inv_trans_8x4_c, 131004 runs, 68 skips
9554 dezicycles in vc1_inv_trans_8x4_c, 261992 runs, 152 skips
9540 dezicycles in vc1_inv_trans_8x4_c, 523987 runs, 301 skips
3789 dezicycles in vc1_inv_trans_8x4_mmx, 65517 runs, 19 skips
3787 dezicycles in vc1_inv_trans_8x4_mmx, 131035 runs, 37 skips
3786 dezicycles in vc1_inv_trans_8x4_mmx, 262077 runs, 67 skips
3786 dezicycles in vc1_inv_trans_8x4_mmx, 524161 runs, 127 skips
7377 dezicycles in vc1_inv_trans_8x8_c, 65506 runs, 30 skips
7377 dezicycles in vc1_inv_trans_8x8_c, 131009 runs, 63 skips
7377 dezicycles in vc1_inv_trans_8x8_c, 262014 runs, 130 skips
7377 dezicycles in vc1_inv_trans_8x8_c, 524041 runs, 247 skips
7360 dezicycles in vc1_inv_trans_8x8_mmx, 65503 runs, 33 skips
7376 dezicycles in vc1_inv_trans_8x8_mmx, 131006 runs, 66 skips
7382 dezicycles in vc1_inv_trans_8x8_mmx, 262016 runs, 128 skips
7387 dezicycles in vc1_inv_trans_8x8_mmx, 524047 runs, 241 skips
-------------- next part --------------
Attachment: vc1_inv_trans_mmx_v2.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080622/663a7fa0/attachment.txt>