[FFmpeg-devel] [PATCH] MMX/SSE2 qpel functions for RV40
Michael Niedermayer
michaelni
Thu Jan 8 00:06:52 CET 2009
On Mon, Jan 05, 2009 at 07:35:30PM +0100, Mathieu Velten wrote:
> 2009/1/5 Michael Niedermayer <michaelni at gmx.at>:
> > you can access constants like ff_pw_5 through MANGLE(), thus bypassing
> > gcc's register deallocator
>
> thanks, new patch attached based on the first version with MANGLE(ff_pw_5).
[...]
> + for(i=0; i<h; i++) {\
> + __asm__ volatile(\
> + "pxor %%mm7, %%mm7 \n\t"\
This can be done outside the loop; also, the loop itself should be in asm, not C.
> + "movq "MANGLE(ff_pw_5)", %%mm6 \n\t" /* mm6 = ff_pw_5 */\
> + "movq -2(%0), %%mm0 \n\t"\
> + "movq 3(%0), %%mm3 \n\t"\
> + "movq %%mm0, %%mm2 \n\t"\
> + "movq %%mm3, %%mm1 \n\t"\
> + "punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = src[-2..1] */\
> + "punpcklbw %%mm7, %%mm3 \n\t" /* mm3 = src[3..6] */\
> + "punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = src[2..5] */\
> + "punpckhbw %%mm7, %%mm1 \n\t" /* mm1 = src[7..10] */\
> + "paddw %%mm3, %%mm0 \n\t"\
> + "paddw %%mm2, %%mm1 \n\t" /* mm0/mm1 = src[-2..5] + src[3..10] */\
> + "movd -1(%0), %%mm4 \n\t"\
> + "movd 6(%0), %%mm5 \n\t"\
> + "punpcklbw %%mm7, %%mm4 \n\t" /* mm4 = src[-1..2] */\
> + "punpcklbw %%mm7, %%mm5 \n\t" /* mm5 = src[6..9] */\
> + "paddw %%mm4, %%mm2 \n\t"\
> + "paddw %%mm5, %%mm3 \n\t"\
> + "pmullw %%mm6, %%mm2 \n\t"\
> + "pmullw %%mm6, %%mm3 \n\t" /* mm2/mm3 = (src[-1..6]+src[2..9]) * 5 */\
> + "movq %2, %%mm6 \n\t" /* mm6 = ff_pw_C1 */\
> + "movq 0(%0), %%mm4 \n\t"\
The explicit 0 displacement is unneeded; (%0) addresses the same location.
> + "movq %%mm4, %%mm5 \n\t"\
> + "punpcklbw %%mm7, %%mm4 \n\t" /* mm4 = src[0..3] */\
> + "punpckhbw %%mm7, %%mm5 \n\t" /* mm5 = src[4..7] */\
> + "pmullw %%mm6, %%mm4 \n\t"\
> + "pmullw %%mm6, %%mm5 \n\t" /* mm4/mm5 = src[0..7] * C1 */\
> + "movq %3, %%mm6 \n\t" /* mm6 = ff_pw_C2 */\
> + "paddw %%mm4, %%mm0 \n\t"\
> + "paddw %%mm5, %%mm1 \n\t" /* mm0/mm1 += src[0..7] * C1 */\
> + "movq 1(%0), %%mm4 \n\t"\
> + "movq %%mm4, %%mm5 \n\t"\
> + "punpcklbw %%mm7, %%mm4 \n\t" /* mm4 = src[1..4] */\
> + "punpckhbw %%mm7, %%mm5 \n\t" /* mm5 = src[5..8] */\
> + "pmullw %%mm6, %%mm4 \n\t"\
> + "pmullw %%mm6, %%mm5 \n\t" /* mm4/mm5 = src[1..8] * C2 */\
> + "movq %4, %%mm6 \n\t" /* mm6 = rnd_reg */\
> + "paddw %%mm4, %%mm0 \n\t"\
> + "paddw %%mm5, %%mm1 \n\t" /* mm0/mm1 += src[1..8] * C2 */\
> + "movd %5, %%mm5 \n\t" /* mm5 = SHIFT */\
> + "psubw %%mm2, %%mm0 \n\t"\
> + "psubw %%mm3, %%mm1 \n\t" /* mm0/mm1 -= (src[-1..6]+src[2..9]) * 5 */\
> + "paddw %%mm6, %%mm0 \n\t"\
> + "paddw %%mm6, %%mm1 \n\t" /* mm0/mm1 += rnd_reg */\
> + "psraw %%mm5, %%mm0 \n\t"\
> + "psraw %%mm5, %%mm1 \n\t" /* mm0/mm1 >>= SHIFT */\
I wonder if SHIFT would not be better as a constant ...
(No, I am not saying change it; only do so if it's faster and doesn't lead to
much larger object files.)
> + "packuswb %%mm1, %%mm0 \n\t"\
> + OP(%%mm0, (%1),%%mm5, q)\
> + : "+a"(src), "+c"(dst)\
> + : "m"(*C1_reg), "m"(*C2_reg), "m"(*rnd_reg), "D"((x86_reg)SHIFT)\
> + : "memory"\
C1_reg, C2_reg, rnd_reg, and SHIFT could be in a table, thus requiring only 1 reg
to access them.
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
I have often repented speaking, but never of holding my tongue.
-- Xenocrates
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090108/b275b106/attachment.pgp>
More information about the ffmpeg-devel
mailing list