[Mplayer-cvslog] CVS: main/libac3/mmx imdct_3dnow.c,1.7,1.8 srfft_3dnow.c,1.12,1.13 srfftp_3dnow.h,1.8,1.9
Nick Kurshev
nickols_k at users.sourceforge.net
Wed Jun 20 09:53:57 CEST 2001
Update of /cvsroot/mplayer/main/libac3/mmx
In directory usw-pr-cvs1:/tmp/cvs-serv10866/main/libac3/mmx
Modified Files:
imdct_3dnow.c srfft_3dnow.c srfftp_3dnow.h
Log Message:
Better 3dnow! optimization
Index: imdct_3dnow.c
===================================================================
RCS file: /cvsroot/mplayer/main/libac3/mmx/imdct_3dnow.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -r1.7 -r1.8
*** imdct_3dnow.c 2001/06/13 15:17:10 1.7
--- imdct_3dnow.c 2001/06/20 07:53:55 1.8
***************
*** 28,31 ****
--- 28,36 ----
**/
+ #include "mmx/srfftp_3dnow.h"
+
+ extern const i_cmplx_t x_plus_minus_3dnow;
+ extern const i_cmplx_t x_minus_plus_3dnow;
+
static void imdct_do_512(float data[],float delay[])
{
***************
*** 40,50 ****
__asm__ __volatile__ (
! "movl $1, %%eax\n\t"
! "movd %%eax, %%mm7\n\t"
! "negl %%eax\n\t"
! "movd %%eax, %%mm6\n\t"
! "punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! "pi2fd %%mm7, %%mm7\n\t"
! :::"eax","memory");
for( i=0; i < 128; i++) {
j = pm128[i];
--- 45,51 ----
__asm__ __volatile__ (
! "movq %0, %%mm7\n\t"
! ::"m"(x_plus_minus_3dnow)
! :"memory");
for( i=0; i < 128; i++) {
j = pm128[i];
***************
*** 67,74 ****
"pfpnacc %%mm2, %%mm0\n\t"
#else
! "pfmul %%mm7, %%mm0\n\t"
"pfacc %%mm2, %%mm0\n\t"
#endif
! "pfmul %%mm7, %%mm0\n\t"
"movq %%mm0, %0"
:"=m"(buf[i])
--- 68,75 ----
"pfpnacc %%mm2, %%mm0\n\t"
#else
! "pxor %%mm7, %%mm0\n\t"
"pfacc %%mm2, %%mm0\n\t"
#endif
! "pxor %%mm7, %%mm0\n\t"
"movq %%mm0, %0"
:"=m"(buf[i])
***************
*** 84,102 ****
// Post IFFT complex multiply plus IFFT complex conjugate
__asm__ __volatile__ (
! "movl $1, %%eax\n\t"
! "movd %%eax, %%mm7\n\t"
! "negl %%eax\n\t"
! "movd %%eax, %%mm6\n\t"
! #ifndef HAVE_3DNOWEX
! "punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! "punpckldq %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
! "pi2fd %%mm7, %%mm7\n\t"
! "pi2fd %%mm6, %%mm6\n\t"
! #else
! "punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! "pi2fd %%mm7, %%mm7\n\t"
! "pswapd %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
! #endif
! :::"eax","memory");
for (i=0; i < 128; i++) {
__asm__ __volatile__ (
--- 85,93 ----
// Post IFFT complex multiply plus IFFT complex conjugate
__asm__ __volatile__ (
! "movq %0, %%mm7\n\t"
! "movq %1, %%mm6\n\t"
! ::"m"(x_plus_minus_3dnow),
! "m"(x_minus_plus_3dnow)
! :"eax","memory");
for (i=0; i < 128; i++) {
__asm__ __volatile__ (
***************
*** 115,119 ****
"pfmul %%mm3, %%mm1\n\t"
#ifndef HAVE_3DNOWEX
! "pfmul %%mm7, %%mm0\n\t"
"pfacc %%mm1, %%mm0\n\t"
"movd %%mm0, 4%0\n\t"
--- 106,110 ----
"pfmul %%mm3, %%mm1\n\t"
#ifndef HAVE_3DNOWEX
! "pxor %%mm7, %%mm0\n\t"
"pfacc %%mm1, %%mm0\n\t"
"movd %%mm0, 4%0\n\t"
***************
*** 145,150 ****
"pfmul (%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/
"pfmul 512(%3), %%mm1\n\t"
! "pfmul %%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/
! "pfmul %%mm6, %%mm1\n\t"
"pfadd (%4), %%mm0\n\t"
"pfadd 512(%4), %%mm1\n\t"
--- 136,141 ----
"pfmul (%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/
"pfmul 512(%3), %%mm1\n\t"
! "pxor %%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/
! "pxor %%mm6, %%mm1\n\t"
"pfadd (%4), %%mm0\n\t"
"pfadd 512(%4), %%mm1\n\t"
***************
*** 182,187 ****
"pfmul %%mm3, %%mm0\n\t"
"pfmul %%mm4, %%mm1\n\t"
! "pfmul %%mm6, %%mm0\n\t"
! "pfmul %%mm7, %%mm1\n\t"
"movq %%mm0, (%0)\n\t"
"movq %%mm1, 512(%0)"
--- 173,178 ----
"pfmul %%mm3, %%mm0\n\t"
"pfmul %%mm4, %%mm1\n\t"
! "pxor %%mm6, %%mm0\n\t"
! "pxor %%mm7, %%mm1\n\t"
"movq %%mm0, (%0)\n\t"
"movq %%mm1, 512(%0)"
Index: srfft_3dnow.c
===================================================================
RCS file: /cvsroot/mplayer/main/libac3/mmx/srfft_3dnow.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -r1.12 -r1.13
*** srfft_3dnow.c 2001/06/09 08:50:48 1.12
--- srfft_3dnow.c 2001/06/20 07:53:55 1.13
***************
*** 32,35 ****
--- 32,39 ----
#include "mmx/srfftp_3dnow.h"
+ const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = { 0x00000000UL, 0x80000000UL };
+ const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = { 0x80000000UL, 0x00000000UL };
+ const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
+
static void fft_4(complex_t *x)
{
***************
*** 37,61 ****
/* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
*/
- __asm__ __volatile__ (
- "movl $1, %%eax\n\t"
- "movd %%eax, %%mm7\n\t"
- "negl %%eax\n\t"
- "movd %%eax, %%mm6\n\t"
- #ifndef HAVE_3DNOWEX
- "punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
- "punpckldq %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
- "pi2fd %%mm7, %%mm7\n\t"
- "pi2fd %%mm6, %%mm6\n\t"
- #else
- "punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
- "pi2fd %%mm7, %%mm7\n\t"
- "pswapd %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
- #endif
- :::"eax","memory");
__asm__ __volatile__(
! "movq 24(%0), %%mm3\n\t"
! "movq 8(%0), %%mm1\n\t"
! "pfmul %%mm7, %%mm3\n\t" /* mm3.re | -mm3.im */
! "pfmul %%mm6, %%mm1\n\t" /* -mm1.re | mm1.im */
"pfadd %%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */
"movq %%mm3, %%mm4\n\t" /* vi.re =-x[3].im + x[1].im; mm4 = vi */
--- 41,49 ----
/* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
*/
__asm__ __volatile__(
! "movq 24(%1), %%mm3\n\t"
! "movq 8(%1), %%mm1\n\t"
! "pxor %2, %%mm3\n\t" /* mm3.re | -mm3.im */
! "pxor %3, %%mm1\n\t" /* -mm1.re | mm1.im */
"pfadd %%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */
"movq %%mm3, %%mm4\n\t" /* vi.re =-x[3].im + x[1].im; mm4 = vi */
***************
*** 67,77 ****
"punpckldq %%mm5, %%mm4\n\t"
#endif
!
! "movq (%0), %%mm5\n\t" /* yb.re = x[0].re - x[2].re; */
! "movq (%0), %%mm6\n\t" /* yt.re = x[0].re + x[2].re; */
! "movq 24(%0), %%mm7\n\t" /* u.re = x[3].re + x[1].re; */
! "pfsub 16(%0), %%mm5\n\t" /* yb.im = x[0].im - x[2].im; mm5 = yb */
! "pfadd 16(%0), %%mm6\n\t" /* yt.im = x[0].im + x[2].im; mm6 = yt */
! "pfadd 8(%0), %%mm7\n\t" /* u.im = x[3].im + x[1].im; mm7 = u */
"movq %%mm6, %%mm0\n\t" /* x[0].re = yt.re + u.re; */
--- 55,64 ----
"punpckldq %%mm5, %%mm4\n\t"
#endif
! "movq (%1), %%mm5\n\t" /* yb.re = x[0].re - x[2].re; */
! "movq (%1), %%mm6\n\t" /* yt.re = x[0].re + x[2].re; */
! "movq 24(%1), %%mm7\n\t" /* u.re = x[3].re + x[1].re; */
! "pfsub 16(%1), %%mm5\n\t" /* yb.im = x[0].im - x[2].im; mm5 = yb */
! "pfadd 16(%1), %%mm6\n\t" /* yt.im = x[0].im + x[2].im; mm6 = yt */
! "pfadd 8(%1), %%mm7\n\t" /* u.im = x[3].im + x[1].im; mm7 = u */
"movq %%mm6, %%mm0\n\t" /* x[0].re = yt.re + u.re; */
***************
*** 87,142 ****
"movq %%mm5, 24(%0)" /* x[3].im = yb.im - vi.im; */
:"=r"(x)
! :"0"(x)
:"memory");
}
- #if 0
- /* is never called */
- static void fftu_4(complex_t *x)
- {
- /* delta_p = 1 here */
- /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{2*pi/4}
- */
-
- register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
-
- yt_r = x[0].re;
- yb_r = yt_r - x[2].re;
- yt_r += x[2].re;
- u_r = x[1].re;
- vi_i = x[3].re - u_r;
- u_r += x[3].re;
-
- u_i = x[1].im;
- vi_r = u_i - x[3].im;
- u_i += x[3].im;
-
- yt_i = yt_r;
- yt_i += u_r;
- x[0].re = yt_i;
- yt_r -= u_r;
- x[2].re = yt_r;
- yt_i = yb_r;
- yt_i += vi_r;
- x[3].re = yt_i;
- yb_r -= vi_r;
- x[1].re = yb_r;
-
- yt_i = x[0].im;
- yb_i = yt_i - x[2].im;
- yt_i += x[2].im;
-
- yt_r = yt_i;
- yt_r += u_i;
- x[0].im = yt_r;
- yt_i -= u_i;
- x[2].im = yt_i;
- yt_r = yb_i;
- yt_r += vi_i;
- x[3].im = yt_r;
- yb_i -= vi_i;
- x[1].im = yb_i;
- }
- #endif
static void fft_8(complex_t *x)
{
--- 74,83 ----
"movq %%mm5, 24(%0)" /* x[3].im = yb.im - vi.im; */
:"=r"(x)
! :"0"(x),
! "m"(x_plus_minus_3dnow),
! "m"(x_minus_plus_3dnow)
:"memory");
}
static void fft_8(complex_t *x)
{
***************
*** 211,230 ****
/* x[1] x[5] */
__asm__ __volatile__ (
! "movl $1, %%eax\n\t"
! "movd %%eax, %%mm7\n\t"
! "negl %%eax\n\t"
! "movd %%eax, %%mm6\n\t"
! #ifndef HAVE_3DNOWEX
! "punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! "punpckldq %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
! "pi2fd %%mm7, %%mm7\n\t"
! "pi2fd %%mm6, %%mm6\n\t"
! #else
! "punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! "pi2fd %%mm7, %%mm7\n\t"
! "pswapd %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
! #endif
! :::"eax","memory");
! __asm__ __volatile__ (
"movq %1, %%mm0\n\t"
"movq %2, %%mm1\n\t"
--- 152,157 ----
/* x[1] x[5] */
__asm__ __volatile__ (
! "movq %6, %%mm6\n\t"
! "movq %5, %%mm7\n\t"
"movq %1, %%mm0\n\t"
"movq %2, %%mm1\n\t"
***************
*** 238,242 ****
"punpckldq %%mm2,%%mm1\n\t"
#endif
! "pfmul %%mm7, %%mm1\n\t"
"pfadd %%mm1, %%mm0\n\t"
#ifdef HAVE_3DNOWEX
--- 165,169 ----
"punpckldq %%mm2,%%mm1\n\t"
#endif
! "pxor %%mm7, %%mm1\n\t"
"pfadd %%mm1, %%mm0\n\t"
#ifdef HAVE_3DNOWEX
***************
*** 247,254 ****
"punpckldq %%mm2,%%mm3\n\t"
#endif
! "pfmul %%mm6, %%mm3\n\t"
"pfadd %%mm3, %%mm0\n\t"
"movq %%mm0, %%mm1\n\t"
! "pfmul %%mm6, %%mm1\n\t"
"pfacc %%mm1, %%mm0\n\t"
"pfmul %4, %%mm0\n\t"
--- 174,181 ----
"punpckldq %%mm2,%%mm3\n\t"
#endif
! "pxor %%mm6, %%mm3\n\t"
"pfadd %%mm3, %%mm0\n\t"
"movq %%mm0, %%mm1\n\t"
! "pxor %%mm6, %%mm1\n\t"
"pfacc %%mm1, %%mm0\n\t"
"pfmul %4, %%mm0\n\t"
***************
*** 271,275 ****
"movq %%mm2, 8(%3)\n\t"
:"=m"(wB2)
! :"m"(wT1), "m"(wB1), "r"(x), "m"(HSQRT2_3DNOW)
:"memory");
--- 198,203 ----
"movq %%mm2, 8(%3)\n\t"
:"=m"(wB2)
! :"m"(wT1), "m"(wB1), "r"(x), "m"(HSQRT2_3DNOW),
! "m"(x_plus_minus_3dnow), "m"(x_minus_plus_3dnow)
:"memory");
***************
*** 285,293 ****
"punpckldq %3, %%mm1\n\t"
#endif
! "pfmul %%mm6, %%mm1\n\t"
"pfadd %%mm1, %%mm0\n\t"
"movq %2, %%mm2\n\t"
"movq 56(%4), %%mm3\n\t"
! "pfmul %%mm7, %%mm3\n\t"
"pfadd %%mm3, %%mm2\n\t"
#ifdef HAVE_3DNOWEX
--- 213,221 ----
"punpckldq %3, %%mm1\n\t"
#endif
! "pxor %%mm6, %%mm1\n\t"
"pfadd %%mm1, %%mm0\n\t"
"movq %2, %%mm2\n\t"
"movq 56(%4), %%mm3\n\t"
! "pxor %%mm7, %%mm3\n\t"
"pfadd %%mm3, %%mm2\n\t"
#ifdef HAVE_3DNOWEX
***************
*** 302,311 ****
"movq %%mm3, %%mm4\n\t"
"movq %%mm0, %%mm1\n\t"
! "pfmul %%mm6, %%mm0\n\t"
"pfacc %%mm1, %%mm0\n\t"
"pfmul %5, %%mm0\n\t"
"movq %%mm0, %%mm1\n\t"
! "pfmul %%mm6, %%mm1\n\t"
! "pfmul %%mm7, %%mm0\n\t"
"pfadd %%mm1, %%mm3\n\t"
"pfadd %%mm0, %%mm4\n\t"
--- 230,239 ----
"movq %%mm3, %%mm4\n\t"
"movq %%mm0, %%mm1\n\t"
! "pxor %%mm6, %%mm0\n\t"
"pfacc %%mm1, %%mm0\n\t"
"pfmul %5, %%mm0\n\t"
"movq %%mm0, %%mm1\n\t"
! "pxor %%mm6, %%mm1\n\t"
! "pxor %%mm7, %%mm0\n\t"
"pfadd %%mm1, %%mm3\n\t"
"pfadd %%mm0, %%mm4\n\t"
***************
*** 316,439 ****
:"memory");
}
- #if 0
- /* is never called */
- static void fftu_8(complex_t *x)
- {
- /* delta_p = diag{1, sqrt(i)} here */
- /* this function computes x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{2*pi/8}
- */
- register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
-
- /* 2 F_2 on x(1:4:7) and x(3:4:7) */
- /* wB2 is weighted by i */
-
- wT1_r = x[1].re;
- wT1_i = x[1].im;
- wB1_r = x[3].re;
- wB1_i = x[3].im;
-
- /* 1 F_4 on x(0:2:7) */
- x[1] = x[2];
- x[2] = x[4];
- x[3] = x[6];
- fft_4(&x[0]);
-
-
- /* x[0] x[4] */
- wT2_r = x[5].re;
- wT2_r += x[7].re;
- wT2_r += wT1_r;
- wT2_r += wB1_r;
- wT2_i = wT2_r;
- wT2_r += x[0].re;
- wT2_i = x[0].re - wT2_i;
- x[0].re = wT2_r;
- x[4].re = wT2_i;
-
- wT2_i = x[5].im;
- wT2_i += x[7].im;
- wT2_i += wT1_i;
- wT2_i += wB1_i;
- wT2_r = wT2_i;
- wT2_r += x[0].im;
- wT2_i = x[0].im - wT2_i;
- x[0].im = wT2_r;
- x[4].im = wT2_i;
-
- /* x[2] x[6] */
- wT2_r = x[5].im;
- wT2_r -= x[7].im;
- wT2_r += wT1_i;
- wT2_r -= wB1_i;
- wT2_i = wT2_r;
- wT2_r += x[2].re;
- wT2_i = x[2].re - wT2_i;
- x[2].re = wT2_i;
- x[6].re = wT2_r;
- wT2_i = x[5].re;
- wT2_i -= x[7].re;
- wT2_i += wT1_r;
- wT2_i -= wB1_r;
- wT2_r = wT2_i;
- wT2_r += x[2].im;
- wT2_i = x[2].im - wT2_i;
- x[2].im = wT2_r;
- x[6].im = wT2_i;
-
-
- /* x[1] x[5] */
- wT2_r = wT1_r;
- wT2_r -= wB1_i;
- wT2_r -= x[5].re;
- wT2_r += x[7].im;
- wT2_i = wT1_i;
- wT2_i += wB1_r;
- wT2_i -= x[5].im;
- wT2_i -= x[7].re;
-
- wB2_r = wT2_r;
- wB2_r -= wT2_i;
- wT2_i += wT2_r;
- wB2_r *= HSQRT2;
- wT2_i *= HSQRT2;
- wT2_r = wB2_r;
- wB2_r += x[1].re;
- wT2_r = x[1].re - wT2_r;
-
- wB2_i = x[5].re;
- x[1].re = wB2_r;
- x[5].re = wT2_r;
-
- wT2_r = wT2_i;
- wT2_r += x[1].im;
- wT2_i = x[1].im - wT2_i;
- wB2_r = x[5].im;
- x[1].im = wT2_r;
- x[5].im = wT2_i;
-
- /* x[3] x[7] */
- wT1_r += wB1_i;
- wT1_i -= wB1_r;
- wB1_r = wB2_i + x[7].im;
- wB1_i = wB2_r - x[7].re;
- wT1_r -= wB1_r;
- wT1_i -= wB1_i;
- wB1_r = wT1_r - wT1_i;
- wB1_r *= HSQRT2;
- wT1_i += wT1_r;
- wT1_i *= HSQRT2;
- wB2_r = x[3].re;
- wB2_i = wB2_r + wT1_i;
- wB2_r -= wT1_i;
- x[3].re = wB2_r;
- x[7].re = wB2_i;
- wB2_i = x[3].im;
- wB2_r = wB2_i + wB1_r;
- wB2_i -= wB1_r;
- x[3].im = wB2_r;
- x[7].im = wB2_i;
- }
- #endif
void fft_asmb(int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3)
--- 244,248 ----
Index: srfftp_3dnow.h
===================================================================
RCS file: /cvsroot/mplayer/main/libac3/mmx/srfftp_3dnow.h,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -r1.8 -r1.9
*** srfftp_3dnow.h 2001/06/12 15:55:59 1.8
--- srfftp_3dnow.h 2001/06/20 07:53:55 1.9
***************
*** 34,63 ****
#define SRFFTP_3DNOW_H__
! static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
- #ifdef HAVE_3DNOWEX
#define TRANS_FILL_MM6_MM7_3DNOW()\
__asm__ __volatile__(\
! "movl $-1, %%eax\n\t"\
! "movd %%eax, %%mm7\n\t"\
! "negl %%eax\n\t"\
! "movd %%eax, %%mm6\n\t"\
! "punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\
! "pi2fd %%mm7, %%mm7\n\t"\
! "pswapd %%mm7, %%mm6\n\t"/* 1.0 | -1.0 */\
! :::"eax","memory");
! #else
! #define TRANS_FILL_MM6_MM7_3DNOW()\
! __asm__ __volatile__(\
! "movl $-1, %%eax\n\t"\
! "movd %%eax, %%mm7\n\t"\
! "negl %%eax\n\t"\
! "movd %%eax, %%mm6\n\t"\
! "punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\
! "punpckldq %%mm7, %%mm6\n\t" /* 1.0 | -1.0 */\
! "pi2fd %%mm7, %%mm7\n\t"\
! "pi2fd %%mm6, %%mm6\n\t"\
! :::"eax","memory");
! #endif
#ifdef HAVE_3DNOWEX
--- 34,49 ----
#define SRFFTP_3DNOW_H__
! typedef struct
! {
! unsigned long val[2];
! }i_cmplx_t;
#define TRANS_FILL_MM6_MM7_3DNOW()\
__asm__ __volatile__(\
! "movq %1, %%mm7\n\t"\
! "movq %0, %%mm6\n\t"\
! ::"m"(x_plus_minus_3dnow),\
! "m"(x_minus_plus_3dnow)\
! :"memory");
#ifdef HAVE_3DNOWEX
***************
*** 86,91 ****
"movq %%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\
"pfadd %%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\
! "pfmul %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\
! "pfmul %%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\
"pfadd %%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\
"movq %%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\
--- 72,77 ----
"movq %%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\
"pfadd %%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\
! "pxor %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\
! "pxor %%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\
"pfadd %%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\
"movq %%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\
***************
*** 113,128 ****
"movq %4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\
"movq %%mm0, %%mm1\n\t"\
! "pfmul %%mm7, %%mm1\n\t"\
"pfacc %%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\
"movq %5, %%mm1\n\t" /*a.re = wTB[6].im - wTB[6].re; */\
"movq %%mm1, %%mm2\n\t"\
! "pfmul %%mm7, %%mm1\n\t"\
"pfacc %%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re; mm1 = a*/\
"movq %%mm1, %%mm2\n\t"\
! "pfmul %%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\
"movq %%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\
"pfadd %%mm2, %%mm3\n\t"\
PSWAP_MM("%%mm3","%%mm2")/*mm3 = v*/\
! "pfmul %%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\
"pfadd %%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\
"movq %8, %%mm2\n\t"\
--- 99,114 ----
"movq %4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\
"movq %%mm0, %%mm1\n\t"\
! "pxor %%mm7, %%mm1\n\t"\
"pfacc %%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\
"movq %5, %%mm1\n\t" /*a.re = wTB[6].im - wTB[6].re; */\
"movq %%mm1, %%mm2\n\t"\
! "pxor %%mm7, %%mm1\n\t"\
"pfacc %%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re; mm1 = a*/\
"movq %%mm1, %%mm2\n\t"\
! "pxor %%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\
"movq %%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\
"pfadd %%mm2, %%mm3\n\t"\
PSWAP_MM("%%mm3","%%mm2")/*mm3 = v*/\
! "pxor %%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\
"pfadd %%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\
"movq %8, %%mm2\n\t"\
***************
*** 134,140 ****
"movq %%mm3, %%mm4\n\t"\
"pfadd %%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\
! "pfmul %%mm6, %%mm4\n\t"/*A6.re = a1.re + v.re;*/\
"pfsub %%mm0, %%mm2\n\t" /*A2 = a1 - u;*/\
! "pfmul %%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\
"movq %%mm1, %0\n\t"\
"movq %%mm2, %1\n\t"\
--- 120,126 ----
"movq %%mm3, %%mm4\n\t"\
"pfadd %%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\
! "pxor %%mm6, %%mm4\n\t"/*A6.re = a1.re + v.re;*/\
"pfsub %%mm0, %%mm2\n\t" /*A2 = a1 - u;*/\
! "pxor %%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\
"movq %%mm1, %0\n\t"\
"movq %%mm2, %1\n\t"\
***************
*** 160,164 ****
"pfmul %%mm0, %%mm5\n\t"/* mm5 = a.re | a.im */\
PSWAP_MM("%%mm5","%%mm3")\
! "pfmul %%mm7, %%mm5\n\t"\
"pfadd %%mm5, %%mm4\n\t"/* mm4 = u*/\
"movq %3, %%mm1\n\t"\
--- 146,150 ----
"pfmul %%mm0, %%mm5\n\t"/* mm5 = a.re | a.im */\
PSWAP_MM("%%mm5","%%mm3")\
! "pxor %%mm7, %%mm5\n\t"\
"pfadd %%mm5, %%mm4\n\t"/* mm4 = u*/\
"movq %3, %%mm1\n\t"\
***************
*** 172,178 ****
"movq %%mm4, %%mm5\n\t"\
"punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\
! "pfmul %%mm6, %%mm5\n\t"\
"movq %%mm2, %%mm3\n\t"\
! "pfmul %%mm7, %%mm3\n\t"\
"pfadd %%mm3, %%mm5\n\t"\
PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\
--- 158,164 ----
"movq %%mm4, %%mm5\n\t"\
"punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\
! "pxor %%mm6, %%mm5\n\t"\
"movq %%mm2, %%mm3\n\t"\
! "pxor %%mm7, %%mm3\n\t"\
"pfadd %%mm3, %%mm5\n\t"\
PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\
_______________________________________________
Mplayer-cvslog mailing list
Mplayer-cvslog at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-cvslog
More information about the MPlayer-cvslog mailing list