[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations
Michael Niedermayer
michaelni
Sat Jun 21 20:13:03 CEST 2008
On Sat, Jun 21, 2008 at 03:37:44PM +0200, Victor Pollex wrote:
> Hi,
> as in subject.
>
> Victor Pollex
> Index: libavcodec/i386/vc1dsp_mmx.c
> ===================================================================
> --- libavcodec/i386/vc1dsp_mmx.c (Revision 13854)
> +++ libavcodec/i386/vc1dsp_mmx.c (Arbeitskopie)
> @@ -1,6 +1,7 @@
> /*
> * VC-1 and WMV3 - DSP functions MMX-optimized
> * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
> + * Copyright (c) 2008 Victor Pollex
> *
> * Permission is hereby granted, free of charge, to any person
> * obtaining a copy of this software and associated documentation
> @@ -467,7 +468,609 @@
> DECLARE_FUNCTION(3, 2)
> DECLARE_FUNCTION(3, 3)
>
> +#define LOAD_4X4(stride,base,in)\
> + "movq 0*"#stride"+"#base#in", %%mm0\n\t"\
> + "movq 1*"#stride"+"#base#in", %%mm1\n\t"\
> + "movq 2*"#stride"+"#base#in", %%mm2\n\t"\
> + "movq 3*"#stride"+"#base#in", %%mm3\n\t"
duplicate of LOAD4
> +
> +#define STORE_4X4(stride,base,out)\
> + "movq %%mm0, 0*"#stride"+"#base#out"\n\t"\
> + "movq %%mm1, 1*"#stride"+"#base#out"\n\t"\
> + "movq %%mm2, 2*"#stride"+"#base#out"\n\t"\
> + "movq %%mm3, 3*"#stride"+"#base#out"\n\t"
> +
duplicate of STORE4
> +/*
> + precondition:
> + r0 = row0/col0
> + r1 = row1/col1
> + r2 = row2/col2
> + r3 = row3/col3
> +
> + postcondition:
> + r0 = col0/row0
> + r1 = col1/row1
> + r2 = col2/row2
> + r3 = col3/row3
> + t0 = undefined
> +*/
> +#define TRANSPOSE_4X4(r0,r1,r2,r3,t0)\
> + "movq "#r2", "#t0"\n\t"\
> + "punpcklwd "#r3", "#r2"\n\t"\
> + "punpckhwd "#r3", "#t0"\n\t"\
> + \
> + "movq "#r0", "#r3"\n\t"\
> + "punpcklwd "#r1", "#r0"\n\t"\
> + "punpckhwd "#r1", "#r3"\n\t"\
> + \
> + "movq "#r0", "#r1"\n\t"\
> + "punpckldq "#r2", "#r0"\n\t"\
> + "punpckhdq "#r2", "#r1"\n\t"\
> + \
> + "movq "#r3", "#r2"\n\t"\
> + "punpckldq "#t0", "#r2"\n\t"\
> + "punpckhdq "#t0", "#r3"\n\t"
duplicate of TRANSPOSE4
> +
> +
> +/*
> + precodition:
> + -(2^15) <= r0 < 2^15
> + -(2^14) <= r1 < 2^14
> + -(2^15) <= r1 + r0 < 2^15
> + postcondition:
> + r0 = r1 + r0
> + r1 = r1 - r0
> +*/
> +#define TRANSFORM_COMMON_ADDSUB(r0,r1)\
> + "paddw "#r1", "#r0" \n\t"\
> + "psllw $1, "#r1" \n\t"\
> + "psubw "#r0", "#r1" \n\t"
> +
duplicate of SUMSUB_BA
> +/*
> + postcondition:
> + r0 = [15:0](r0 + r2);
> + r1 = [15:0](r1 - r2);
> +*/
> +#define TRANSFORM_COMMON_ADD1SUB1(r0,r1,r2)\
> + "paddw "#r2", "#r0"\n\t" /* r0 + r2 */\
> + "psubw "#r2", "#r1"\n\t" /* r1 - r2 */
"TRANSFORM_COMMON" says nothing about any of the macros; it just
makes them longer.
[...]
> +#define TRANSFORM_4X4_COMMON(r0,r1,r2,r3,r4,r5,r6,r7,c0)\
> + TRANSFORM_COMMON_ADDSUB(r2,r0)\
> + "movq "#r0", "#r5"\n\t" /* r0 - r2 */\
> + "movq "#r2", "#r7"\n\t" /* r0 + r2 */\
> + "pcmpeqw "#r4", "#r4"\n\t" /* -1 */\
> + "psllw $"#c0", "#r4"\n\t" /* -1 << c0 */\
c0 is a constant; calculating -1 << c0 at runtime is inefficient
> + "psubw "#r4", "#r5"\n\t" /* r0 - r2 + (1 << c0) */\
> + "psubw "#r4", "#r7"\n\t" /* r0 + r2 + (1 << c0) */\
> + TRANSFORM_COMMON_SRA2(r5,r7,1)\
> + "movq "#r1", "#r4"\n\t" /* r1 */\
> + "movq "#r3", "#r6"\n\t" /* r3 */\
> + \
> + "psllw $1, "#r1"\n\t" /* 2 * r1 */\
paddw is faster
[...]
> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
> +{
> + DECLARE_ALIGNED_16(int16_t, temp[64]);
> + asm volatile(
> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
> + TRANSFORM_8X4_ROW(0x40, (%0), %1)
> +
> +
> + LOAD_4X4(0x10, 0x00, %1)
> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
> + STORE_4X4(0x10, 0x00, %1)
> + LOAD_4X4(0x10, 0x40, %1)
> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
> + STORE_4X4(0x10, 0x40, %1)
> + TRANSFORM_4X8_COL(0x00, %1, (%0))
> +
> + LOAD_4X4(0x10, 0x08, %1)
> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
> + STORE_4X4(0x10, 0x08, %1)
> + LOAD_4X4(0x10, 0x48, %1)
> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
> + STORE_4X4(0x10, 0x48, %1)
> + TRANSFORM_4X8_COL(0x08, %1, (%0))
> + : "+r"(block), "+m"(temp)
> + :
> + : "memory"
> + );
> +}
> +
> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> + DECLARE_ALIGNED_16(int16_t, temp[64]);
> + asm volatile(
> + TRANSFORM_8X4_ROW(0x00, (%0), %1)
> +
> + LOAD_4X4(0x10, 0x00, %1)
> + TRANSFORM_4X4_COL
> + STORE_4X4(0x10, 0x00, (%0))
> + LOAD_4X4(0x10, 0x08, %1)
> + TRANSFORM_4X4_COL
> + STORE_4X4(0x10, 0x08, (%0))
> +
> + "pxor %%mm7, %%mm7\n\t"
> + LOAD_4X4(0x08, 0x00, (%0))
> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
> + "add %3, %2\n\t"
> + LOAD_4X4(0x08, 0x20, (%0))
> + LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
> + : "+r"(block), "+m"(temp), "+r"(dest)
> + : "r"(linesize)
> + : "memory"
> + );
> +}
> +
> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> + DECLARE_ALIGNED_16(int16_t, temp[64]);
> + asm volatile(
> + LOAD_4X4(0x10, 0x00, (%0))
> + TRANSFORM_4X4_ROW
> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
> + STORE_4X4(0x10, 0x00, %1)
> + LOAD_4X4(0x10, 0x40, (%0))
> + TRANSFORM_4X4_ROW
> + TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
> + STORE_4X4(0x10, 0x40, %1)
> +
> + TRANSFORM_4X8_COL(0x00, %1, (%0))
> +
> + "pxor %%mm7, %%mm7\n\t"
> + LOAD_4X4(0x10, 0x00, (%0))
> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
> + "add %3, %2\n\t"
> + LOAD_4X4(0x10, 0x40, (%0))
> + LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
> + : "+r"(block), "+m"(temp), "+r"(dest)
> + : "r"(linesize)
> + : "memory"
> + );
> +}
> +
> +static void vc1_inv_trans_4x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> + asm volatile(
> + LOAD_4X4(0x10, 0x00, (%1))
> + TRANSFORM_4X4_ROW
> + TRANSFORM_4X4_COL
> + "pxor %%mm7, %%mm7\n\t"
> + LOAD_ADD_CLAMP_STORE_4X4(%0, %2)
> + : "+r"(dest)
> + : "r"(block), "r"(linesize)
> + : "memory"
> + );
> +}
I do not think that brute-force duplicating and unrolling of all variants
is optimal. Also, benchmarks are needed for C vs. your mmx vs. mmx
code with no duplicated transforms.
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
I hate to see young programmers poisoned by the kind of thinking
Ulrich Drepper puts forward since it is simply too narrow -- Roman Shaposhnik
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080621/7bb8f6b6/attachment.pgp>
More information about the ffmpeg-devel
mailing list