[Mplayer-cvslog] CVS: main/libac3/mmx imdct_3dnow.c,1.7,1.8 srfft_3dnow.c,1.12,1.13 srfftp_3dnow.h,1.8,1.9

Nick Kurshev nickols_k at users.sourceforge.net
Wed Jun 20 09:53:57 CEST 2001


Update of /cvsroot/mplayer/main/libac3/mmx
In directory usw-pr-cvs1:/tmp/cvs-serv10866/main/libac3/mmx

Modified Files:
	imdct_3dnow.c srfft_3dnow.c srfftp_3dnow.h 
Log Message:
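Better 3DNow! optimization: flip float signs with pxor against constant sign-bit masks (x_plus_minus_3dnow, x_minus_plus_3dnow) instead of pfmul by +/-1.0 vectors built at run time; also drop the unused "#if 0" fftu_4/fftu_8 code.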

Index: imdct_3dnow.c
===================================================================
RCS file: /cvsroot/mplayer/main/libac3/mmx/imdct_3dnow.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -r1.7 -r1.8
*** imdct_3dnow.c	2001/06/13 15:17:10	1.7
--- imdct_3dnow.c	2001/06/20 07:53:55	1.8
***************
*** 28,31 ****
--- 28,36 ----
   **/
  
+ #include "mmx/srfftp_3dnow.h"
+ 
+ extern const i_cmplx_t x_plus_minus_3dnow;
+ extern const i_cmplx_t x_minus_plus_3dnow;
+ 
  static void imdct_do_512(float data[],float delay[])
  {
***************
*** 40,50 ****
  
    __asm__ __volatile__ (
! 	"movl $1, %%eax\n\t"
! 	"movd %%eax, %%mm7\n\t"
! 	"negl %%eax\n\t"
! 	"movd %%eax, %%mm6\n\t"
! 	"punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! 	"pi2fd %%mm7, %%mm7\n\t"
! 	:::"eax","memory");
  	for( i=0; i < 128; i++) {
  		j = pm128[i];
--- 45,51 ----
  
    __asm__ __volatile__ (
! 	"movq %0, %%mm7\n\t"
! 	::"m"(x_plus_minus_3dnow)
! 	:"memory");
  	for( i=0; i < 128; i++) {
  		j = pm128[i];
***************
*** 67,74 ****
  		"pfpnacc %%mm2, %%mm0\n\t"
  #else
! 		"pfmul	%%mm7, %%mm0\n\t"
  		"pfacc	%%mm2, %%mm0\n\t"
  #endif
! 		"pfmul	%%mm7, %%mm0\n\t"
  		"movq	%%mm0, %0"
  		:"=m"(buf[i])
--- 68,75 ----
  		"pfpnacc %%mm2, %%mm0\n\t"
  #else
! 		"pxor	%%mm7, %%mm0\n\t"
  		"pfacc	%%mm2, %%mm0\n\t"
  #endif
! 		"pxor	%%mm7, %%mm0\n\t"
  		"movq	%%mm0, %0"
  		:"=m"(buf[i])
***************
*** 84,102 ****
  // Post IFFT complex multiply  plus IFFT complex conjugate
    __asm__ __volatile__ (
! 	"movl $1, %%eax\n\t"
! 	"movd %%eax, %%mm7\n\t"
! 	"negl %%eax\n\t"
! 	"movd %%eax, %%mm6\n\t"
! #ifndef HAVE_3DNOWEX
! 	"punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! 	"punpckldq %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
! 	"pi2fd %%mm7, %%mm7\n\t"
! 	"pi2fd %%mm6, %%mm6\n\t"
! #else
! 	"punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! 	"pi2fd %%mm7, %%mm7\n\t"
! 	"pswapd %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
! #endif
! 	:::"eax","memory");
  	for (i=0; i < 128; i++) {
  	    __asm__ __volatile__ (
--- 85,93 ----
  // Post IFFT complex multiply  plus IFFT complex conjugate
    __asm__ __volatile__ (
! 	"movq %0, %%mm7\n\t"
! 	"movq %1, %%mm6\n\t"
! 	::"m"(x_plus_minus_3dnow),
! 	"m"(x_minus_plus_3dnow)
! 	:"eax","memory");
  	for (i=0; i < 128; i++) {
  	    __asm__ __volatile__ (
***************
*** 115,119 ****
  		"pfmul %%mm3, %%mm1\n\t"
  #ifndef HAVE_3DNOWEX
! 		"pfmul %%mm7, %%mm0\n\t"
  		"pfacc %%mm1, %%mm0\n\t"
  		"movd %%mm0, 4%0\n\t"
--- 106,110 ----
  		"pfmul %%mm3, %%mm1\n\t"
  #ifndef HAVE_3DNOWEX
! 		"pxor  %%mm7, %%mm0\n\t"
  		"pfacc %%mm1, %%mm0\n\t"
  		"movd %%mm0, 4%0\n\t"
***************
*** 145,150 ****
  		"pfmul	(%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/
  		"pfmul	512(%3), %%mm1\n\t"
! 		"pfmul	%%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/
! 		"pfmul	%%mm6, %%mm1\n\t"
  		"pfadd	(%4), %%mm0\n\t"
  		"pfadd	512(%4), %%mm1\n\t"
--- 136,141 ----
  		"pfmul	(%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/
  		"pfmul	512(%3), %%mm1\n\t"
! 		"pxor	%%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/
! 		"pxor	%%mm6, %%mm1\n\t"
  		"pfadd	(%4), %%mm0\n\t"
  		"pfadd	512(%4), %%mm1\n\t"
***************
*** 182,187 ****
  		"pfmul	%%mm3, %%mm0\n\t"
  		"pfmul	%%mm4, %%mm1\n\t"
! 		"pfmul	%%mm6, %%mm0\n\t"
! 		"pfmul	%%mm7, %%mm1\n\t"
  		"movq	%%mm0, (%0)\n\t"
  		"movq	%%mm1, 512(%0)"
--- 173,178 ----
  		"pfmul	%%mm3, %%mm0\n\t"
  		"pfmul	%%mm4, %%mm1\n\t"
! 		"pxor	%%mm6, %%mm0\n\t"
! 		"pxor	%%mm7, %%mm1\n\t"
  		"movq	%%mm0, (%0)\n\t"
  		"movq	%%mm1, 512(%0)"

Index: srfft_3dnow.c
===================================================================
RCS file: /cvsroot/mplayer/main/libac3/mmx/srfft_3dnow.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -r1.12 -r1.13
*** srfft_3dnow.c	2001/06/09 08:50:48	1.12
--- srfft_3dnow.c	2001/06/20 07:53:55	1.13
***************
*** 32,35 ****
--- 32,39 ----
  #include "mmx/srfftp_3dnow.h"
  
+ const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = { 0x00000000UL, 0x80000000UL }; 
+ const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = { 0x80000000UL, 0x00000000UL }; 
+ const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
+ 
  static void fft_4(complex_t *x)
  {
***************
*** 37,61 ****
    /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} 
     */
-   __asm__ __volatile__ (
- 	"movl $1, %%eax\n\t"
- 	"movd %%eax, %%mm7\n\t"
- 	"negl %%eax\n\t"
- 	"movd %%eax, %%mm6\n\t"
- #ifndef HAVE_3DNOWEX
- 	"punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
- 	"punpckldq %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
- 	"pi2fd %%mm7, %%mm7\n\t"
- 	"pi2fd %%mm6, %%mm6\n\t"
- #else
- 	"punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
- 	"pi2fd %%mm7, %%mm7\n\t"
- 	"pswapd %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
- #endif
- 	:::"eax","memory");
    __asm__ __volatile__(
! 	"movq	24(%0), %%mm3\n\t"
! 	"movq	8(%0), %%mm1\n\t"
! 	"pfmul	%%mm7, %%mm3\n\t" /* mm3.re | -mm3.im */
! 	"pfmul  %%mm6, %%mm1\n\t" /* -mm1.re | mm1.im */
  	"pfadd	%%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */
  	"movq	%%mm3, %%mm4\n\t" /* vi.re =-x[3].im + x[1].im; mm4 = vi */
--- 41,49 ----
    /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} 
     */
    __asm__ __volatile__(
! 	"movq	24(%1), %%mm3\n\t"
! 	"movq	8(%1), %%mm1\n\t"
! 	"pxor	%2, %%mm3\n\t" /* mm3.re | -mm3.im */
! 	"pxor   %3, %%mm1\n\t" /* -mm1.re | mm1.im */
  	"pfadd	%%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */
  	"movq	%%mm3, %%mm4\n\t" /* vi.re =-x[3].im + x[1].im; mm4 = vi */
***************
*** 67,77 ****
  	"punpckldq %%mm5, %%mm4\n\t"
  #endif
! 
! 	"movq	(%0), %%mm5\n\t" /* yb.re = x[0].re - x[2].re; */
! 	"movq	(%0), %%mm6\n\t" /* yt.re = x[0].re + x[2].re; */
! 	"movq	24(%0), %%mm7\n\t" /* u.re  = x[3].re + x[1].re; */
! 	"pfsub	16(%0), %%mm5\n\t" /* yb.im = x[0].im - x[2].im; mm5 = yb */
! 	"pfadd	16(%0), %%mm6\n\t" /* yt.im = x[0].im + x[2].im; mm6 = yt */
! 	"pfadd	8(%0), %%mm7\n\t" /* u.im  = x[3].im + x[1].im; mm7 = u */
  
  	"movq	%%mm6, %%mm0\n\t" /* x[0].re = yt.re + u.re; */
--- 55,64 ----
  	"punpckldq %%mm5, %%mm4\n\t"
  #endif
! 	"movq	(%1), %%mm5\n\t" /* yb.re = x[0].re - x[2].re; */
! 	"movq	(%1), %%mm6\n\t" /* yt.re = x[0].re + x[2].re; */
! 	"movq	24(%1), %%mm7\n\t" /* u.re  = x[3].re + x[1].re; */
! 	"pfsub	16(%1), %%mm5\n\t" /* yb.im = x[0].im - x[2].im; mm5 = yb */
! 	"pfadd	16(%1), %%mm6\n\t" /* yt.im = x[0].im + x[2].im; mm6 = yt */
! 	"pfadd	8(%1), %%mm7\n\t" /* u.im  = x[3].im + x[1].im; mm7 = u */
  
  	"movq	%%mm6, %%mm0\n\t" /* x[0].re = yt.re + u.re; */
***************
*** 87,142 ****
  	"movq	%%mm5, 24(%0)" /* x[3].im = yb.im - vi.im; */
  	:"=r"(x)
! 	:"0"(x)
  	:"memory");
  }
- #if 0
- /* is never called */
- static void fftu_4(complex_t *x)
- {
-   /* delta_p = 1 here */
-   /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{2*pi/4} 
-    */
- 
-   register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
-   
-   yt_r = x[0].re;
-   yb_r = yt_r - x[2].re;
-   yt_r += x[2].re;
  
-   u_r = x[1].re;
-   vi_i = x[3].re - u_r;
-   u_r += x[3].re;
-   
-   u_i = x[1].im;
-   vi_r = u_i - x[3].im;
-   u_i += x[3].im;
- 
-   yt_i = yt_r;
-   yt_i += u_r;
-   x[0].re = yt_i;
-   yt_r -= u_r;
-   x[2].re = yt_r;
-   yt_i = yb_r;
-   yt_i += vi_r;
-   x[3].re = yt_i;
-   yb_r -= vi_r;
-   x[1].re = yb_r;
- 
-   yt_i = x[0].im;
-   yb_i = yt_i - x[2].im;
-   yt_i += x[2].im;
- 
-   yt_r = yt_i;
-   yt_r += u_i;
-   x[0].im = yt_r;
-   yt_i -= u_i;
-   x[2].im = yt_i;
-   yt_r = yb_i;
-   yt_r += vi_i;
-   x[3].im = yt_r;
-   yb_i -= vi_i;
-   x[1].im = yb_i;
- }
- #endif
  static void fft_8(complex_t *x)
  {
--- 74,83 ----
  	"movq	%%mm5, 24(%0)" /* x[3].im = yb.im - vi.im; */
  	:"=r"(x)
! 	:"0"(x),
! 	 "m"(x_plus_minus_3dnow),
! 	 "m"(x_minus_plus_3dnow)
  	:"memory");
  }
  
  static void fft_8(complex_t *x)
  {
***************
*** 211,230 ****
    /* x[1] x[5] */
    __asm__ __volatile__ (
! 	"movl $1, %%eax\n\t"
! 	"movd %%eax, %%mm7\n\t"
! 	"negl %%eax\n\t"
! 	"movd %%eax, %%mm6\n\t"
! #ifndef HAVE_3DNOWEX
! 	"punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! 	"punpckldq %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
! 	"pi2fd %%mm7, %%mm7\n\t"
! 	"pi2fd %%mm6, %%mm6\n\t"
! #else
! 	"punpckldq %%mm6, %%mm7\n\t" /* 1.0 | -1.0 */
! 	"pi2fd %%mm7, %%mm7\n\t"
! 	"pswapd %%mm7, %%mm6\n\t" /* -1.0 | 1.0 */
! #endif
! 	:::"eax","memory");
!   __asm__ __volatile__ (
  	"movq	%1,	%%mm0\n\t"
  	"movq	%2,	%%mm1\n\t"
--- 152,157 ----
    /* x[1] x[5] */
    __asm__ __volatile__ (
! 	"movq	%6,	%%mm6\n\t"
! 	"movq	%5,	%%mm7\n\t"
  	"movq	%1,	%%mm0\n\t"
  	"movq	%2,	%%mm1\n\t"
***************
*** 238,242 ****
  	"punpckldq %%mm2,%%mm1\n\t"
  #endif
! 	"pfmul	%%mm7,	%%mm1\n\t"
  	"pfadd	%%mm1,	%%mm0\n\t"
  #ifdef HAVE_3DNOWEX
--- 165,169 ----
  	"punpckldq %%mm2,%%mm1\n\t"
  #endif
! 	"pxor	%%mm7,	%%mm1\n\t"
  	"pfadd	%%mm1,	%%mm0\n\t"
  #ifdef HAVE_3DNOWEX
***************
*** 247,254 ****
  	"punpckldq %%mm2,%%mm3\n\t"
  #endif
! 	"pfmul	%%mm6,	%%mm3\n\t"
  	"pfadd	%%mm3,	%%mm0\n\t"
  	"movq	%%mm0,	%%mm1\n\t"
! 	"pfmul	%%mm6,	%%mm1\n\t"
  	"pfacc	%%mm1,	%%mm0\n\t"
  	"pfmul	%4,	%%mm0\n\t"
--- 174,181 ----
  	"punpckldq %%mm2,%%mm3\n\t"
  #endif
! 	"pxor	%%mm6,	%%mm3\n\t"
  	"pfadd	%%mm3,	%%mm0\n\t"
  	"movq	%%mm0,	%%mm1\n\t"
! 	"pxor	%%mm6,	%%mm1\n\t"
  	"pfacc	%%mm1,	%%mm0\n\t"
  	"pfmul	%4,	%%mm0\n\t"
***************
*** 271,275 ****
  	"movq	%%mm2,	8(%3)\n\t"
  	:"=m"(wB2)
! 	:"m"(wT1), "m"(wB1), "r"(x), "m"(HSQRT2_3DNOW)
  	:"memory");
  
--- 198,203 ----
  	"movq	%%mm2,	8(%3)\n\t"
  	:"=m"(wB2)
! 	:"m"(wT1), "m"(wB1), "r"(x), "m"(HSQRT2_3DNOW), 
! 	 "m"(x_plus_minus_3dnow), "m"(x_minus_plus_3dnow)
  	:"memory");
  
***************
*** 285,293 ****
  	"punpckldq %3,	%%mm1\n\t"
  #endif
! 	"pfmul	%%mm6,	%%mm1\n\t"	
  	"pfadd	%%mm1,	%%mm0\n\t"
  	"movq	%2,	%%mm2\n\t"
  	"movq	56(%4),	%%mm3\n\t"
! 	"pfmul	%%mm7,	%%mm3\n\t"
  	"pfadd	%%mm3,	%%mm2\n\t"
  #ifdef HAVE_3DNOWEX
--- 213,221 ----
  	"punpckldq %3,	%%mm1\n\t"
  #endif
! 	"pxor	%%mm6,	%%mm1\n\t"	
  	"pfadd	%%mm1,	%%mm0\n\t"
  	"movq	%2,	%%mm2\n\t"
  	"movq	56(%4),	%%mm3\n\t"
! 	"pxor	%%mm7,	%%mm3\n\t"
  	"pfadd	%%mm3,	%%mm2\n\t"
  #ifdef HAVE_3DNOWEX
***************
*** 302,311 ****
  	"movq	%%mm3,	%%mm4\n\t"
  	"movq	%%mm0,	%%mm1\n\t"
! 	"pfmul  %%mm6,	%%mm0\n\t"
  	"pfacc	%%mm1,	%%mm0\n\t"
  	"pfmul	%5,	%%mm0\n\t"
  	"movq	%%mm0,	%%mm1\n\t"
! 	"pfmul	%%mm6,	%%mm1\n\t"
! 	"pfmul	%%mm7,	%%mm0\n\t"
  	"pfadd	%%mm1,	%%mm3\n\t"
  	"pfadd	%%mm0,	%%mm4\n\t"
--- 230,239 ----
  	"movq	%%mm3,	%%mm4\n\t"
  	"movq	%%mm0,	%%mm1\n\t"
! 	"pxor	%%mm6,	%%mm0\n\t"
  	"pfacc	%%mm1,	%%mm0\n\t"
  	"pfmul	%5,	%%mm0\n\t"
  	"movq	%%mm0,	%%mm1\n\t"
! 	"pxor	%%mm6,	%%mm1\n\t"
! 	"pxor	%%mm7,	%%mm0\n\t"
  	"pfadd	%%mm1,	%%mm3\n\t"
  	"pfadd	%%mm0,	%%mm4\n\t"
***************
*** 316,439 ****
  	:"memory");
  }
- #if 0
- /* is never called */
- static void fftu_8(complex_t *x)
- {
-   /* delta_p = diag{1, sqrt(i)} here */
-   /* this function computes x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{2*pi/8} 
-    */
-   register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
-   
-   /* 2 F_2 on x(1:4:7) and x(3:4:7) */
-   /* wB2 is weighted by i */
- 
-   wT1_r = x[1].re;
-   wT1_i = x[1].im;
-   wB1_r = x[3].re;
-   wB1_i = x[3].im;
- 
-   /* 1 F_4 on x(0:2:7) */
-   x[1] = x[2];
-   x[2] = x[4];
-   x[3] = x[6];
-   fft_4(&x[0]);
- 
-   
-   /* x[0] x[4] */
-   wT2_r = x[5].re;
-   wT2_r += x[7].re;
-   wT2_r += wT1_r;
-   wT2_r += wB1_r;
-   wT2_i = wT2_r;
-   wT2_r += x[0].re;
-   wT2_i = x[0].re - wT2_i;
-   x[0].re = wT2_r;
-   x[4].re = wT2_i;
- 
-   wT2_i = x[5].im;
-   wT2_i += x[7].im;
-   wT2_i += wT1_i;
-   wT2_i += wB1_i;
-   wT2_r = wT2_i;
-   wT2_r += x[0].im;
-   wT2_i = x[0].im - wT2_i;
-   x[0].im = wT2_r;
-   x[4].im = wT2_i;
-   
-   /* x[2] x[6] */
-   wT2_r = x[5].im;
-   wT2_r -= x[7].im;
-   wT2_r += wT1_i;
-   wT2_r -= wB1_i;
-   wT2_i = wT2_r;
-   wT2_r += x[2].re;
-   wT2_i = x[2].re - wT2_i;
-   x[2].re = wT2_i;
-   x[6].re = wT2_r;
  
-   wT2_i = x[5].re;
-   wT2_i -= x[7].re;
-   wT2_i += wT1_r;
-   wT2_i -= wB1_r;
-   wT2_r = wT2_i;
-   wT2_r += x[2].im;
-   wT2_i = x[2].im - wT2_i;
-   x[2].im = wT2_r;
-   x[6].im = wT2_i;
-   
- 
-   /* x[1] x[5] */
-   wT2_r = wT1_r;
-   wT2_r -= wB1_i;
-   wT2_r -= x[5].re;
-   wT2_r += x[7].im;
-   wT2_i = wT1_i;
-   wT2_i += wB1_r;
-   wT2_i -= x[5].im;
-   wT2_i -= x[7].re;
- 
-   wB2_r = wT2_r;
-   wB2_r -= wT2_i;
-   wT2_i += wT2_r;
-   wB2_r *= HSQRT2;
-   wT2_i *= HSQRT2;
-   wT2_r = wB2_r;
-   wB2_r += x[1].re;
-   wT2_r =  x[1].re - wT2_r;
- 
-   wB2_i = x[5].re;
-   x[1].re = wB2_r;
-   x[5].re = wT2_r;
- 
-   wT2_r = wT2_i;
-   wT2_r += x[1].im;
-   wT2_i = x[1].im - wT2_i;
-   wB2_r = x[5].im;
-   x[1].im = wT2_r;
-   x[5].im = wT2_i;
- 
-   /* x[3] x[7] */
-   wT1_r += wB1_i;
-   wT1_i -= wB1_r;
-   wB1_r = wB2_i + x[7].im;
-   wB1_i = wB2_r - x[7].re;
-   wT1_r -= wB1_r;
-   wT1_i -= wB1_i;
-   wB1_r = wT1_r - wT1_i;
-   wB1_r *= HSQRT2;
-   wT1_i += wT1_r;
-   wT1_i *= HSQRT2;
-   wB2_r = x[3].re;
-   wB2_i = wB2_r + wT1_i;
-   wB2_r -= wT1_i;
-   x[3].re = wB2_r;
-   x[7].re = wB2_i;
-   wB2_i = x[3].im;
-   wB2_r = wB2_i + wB1_r;
-   wB2_i -= wB1_r;
-   x[3].im = wB2_r;
-   x[7].im = wB2_i;
- }
- #endif
  void fft_asmb(int k, complex_t *x, complex_t *wTB,
  		     const complex_t *d, const complex_t *d_3)
--- 244,248 ----

Index: srfftp_3dnow.h
===================================================================
RCS file: /cvsroot/mplayer/main/libac3/mmx/srfftp_3dnow.h,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -r1.8 -r1.9
*** srfftp_3dnow.h	2001/06/12 15:55:59	1.8
--- srfftp_3dnow.h	2001/06/20 07:53:55	1.9
***************
*** 34,63 ****
  #define SRFFTP_3DNOW_H__
  
! static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
  
- #ifdef HAVE_3DNOWEX
  #define TRANS_FILL_MM6_MM7_3DNOW()\
      __asm__ __volatile__(\
! 	"movl	$-1, %%eax\n\t"\
! 	"movd	%%eax, %%mm7\n\t"\
! 	"negl	%%eax\n\t"\
! 	"movd	%%eax, %%mm6\n\t"\
! 	"punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\
! 	"pi2fd	%%mm7, %%mm7\n\t"\
! 	"pswapd	%%mm7, %%mm6\n\t"/* 1.0 | -1.0 */\
! 	:::"eax","memory");
! #else
! #define TRANS_FILL_MM6_MM7_3DNOW()\
!     __asm__ __volatile__(\
! 	"movl	$-1, %%eax\n\t"\
! 	"movd	%%eax, %%mm7\n\t"\
! 	"negl	%%eax\n\t"\
! 	"movd	%%eax, %%mm6\n\t"\
! 	"punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\
! 	"punpckldq %%mm7, %%mm6\n\t" /* 1.0 | -1.0 */\
! 	"pi2fd	%%mm7, %%mm7\n\t"\
! 	"pi2fd	%%mm6, %%mm6\n\t"\
! 	:::"eax","memory");
! #endif
  
  #ifdef HAVE_3DNOWEX
--- 34,49 ----
  #define SRFFTP_3DNOW_H__
  
! typedef struct
! {
!   unsigned long val[2];
! }i_cmplx_t;
  
  #define TRANS_FILL_MM6_MM7_3DNOW()\
      __asm__ __volatile__(\
! 	"movq	%1, %%mm7\n\t"\
! 	"movq	%0, %%mm6\n\t"\
! 	::"m"(x_plus_minus_3dnow),\
! 	"m"(x_minus_plus_3dnow)\
! 	:"memory");
  
  #ifdef HAVE_3DNOWEX
***************
*** 86,91 ****
  	"movq	%%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\
  	"pfadd	%%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\
! 	"pfmul  %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\
! 	"pfmul	%%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\
  	"pfadd	%%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\
  	"movq	%%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\
--- 72,77 ----
  	"movq	%%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\
  	"pfadd	%%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\
! 	"pxor	%%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\
! 	"pxor	%%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\
  	"pfadd	%%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\
  	"movq	%%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\
***************
*** 113,128 ****
  	"movq	%4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\
  	"movq	%%mm0, %%mm1\n\t"\
! 	"pfmul	%%mm7, %%mm1\n\t"\
  	"pfacc	%%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\
  	"movq	%5, %%mm1\n\t"  /*a.re = wTB[6].im - wTB[6].re; */\
  	"movq	%%mm1, %%mm2\n\t"\
! 	"pfmul	%%mm7, %%mm1\n\t"\
  	"pfacc	%%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re;  mm1 = a*/\
  	"movq	%%mm1, %%mm2\n\t"\
! 	"pfmul	%%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\
  	"movq	%%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\
  	"pfadd	%%mm2, %%mm3\n\t"\
  	PSWAP_MM("%%mm3","%%mm2")/*mm3 = v*/\
! 	"pfmul	%%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\
  	"pfadd	%%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\
  	"movq	%8, %%mm2\n\t"\
--- 99,114 ----
  	"movq	%4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\
  	"movq	%%mm0, %%mm1\n\t"\
! 	"pxor	%%mm7, %%mm1\n\t"\
  	"pfacc	%%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\
  	"movq	%5, %%mm1\n\t"  /*a.re = wTB[6].im - wTB[6].re; */\
  	"movq	%%mm1, %%mm2\n\t"\
! 	"pxor	%%mm7, %%mm1\n\t"\
  	"pfacc	%%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re;  mm1 = a*/\
  	"movq	%%mm1, %%mm2\n\t"\
! 	"pxor	%%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\
  	"movq	%%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\
  	"pfadd	%%mm2, %%mm3\n\t"\
  	PSWAP_MM("%%mm3","%%mm2")/*mm3 = v*/\
! 	"pxor	%%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\
  	"pfadd	%%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\
  	"movq	%8, %%mm2\n\t"\
***************
*** 134,140 ****
  	"movq	%%mm3, %%mm4\n\t"\
  	"pfadd	%%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\
! 	"pfmul	%%mm6, %%mm4\n\t"/*A6.re  = a1.re + v.re;*/\
  	"pfsub	%%mm0, %%mm2\n\t" /*A2 = a1 - u;*/\
! 	"pfmul	%%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\
  	"movq	%%mm1, %0\n\t"\
  	"movq	%%mm2, %1\n\t"\
--- 120,126 ----
  	"movq	%%mm3, %%mm4\n\t"\
  	"pfadd	%%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\
! 	"pxor	%%mm6, %%mm4\n\t"/*A6.re  = a1.re + v.re;*/\
  	"pfsub	%%mm0, %%mm2\n\t" /*A2 = a1 - u;*/\
! 	"pxor	%%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\
  	"movq	%%mm1, %0\n\t"\
  	"movq	%%mm2, %1\n\t"\
***************
*** 160,164 ****
  	"pfmul	%%mm0,	%%mm5\n\t"/* mm5 = a.re | a.im */\
  	PSWAP_MM("%%mm5","%%mm3")\
! 	"pfmul	%%mm7,	%%mm5\n\t"\
  	"pfadd	%%mm5,	%%mm4\n\t"/* mm4 = u*/\
  	"movq	%3,	%%mm1\n\t"\
--- 146,150 ----
  	"pfmul	%%mm0,	%%mm5\n\t"/* mm5 = a.re | a.im */\
  	PSWAP_MM("%%mm5","%%mm3")\
! 	"pxor	%%mm7,	%%mm5\n\t"\
  	"pfadd	%%mm5,	%%mm4\n\t"/* mm4 = u*/\
  	"movq	%3,	%%mm1\n\t"\
***************
*** 172,178 ****
  	"movq	%%mm4,	%%mm5\n\t"\
  	"punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\
! 	"pfmul	%%mm6,	%%mm5\n\t"\
  	"movq	%%mm2,	%%mm3\n\t"\
! 	"pfmul	%%mm7,	%%mm3\n\t"\
  	"pfadd	%%mm3,	%%mm5\n\t"\
  	PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\
--- 158,164 ----
  	"movq	%%mm4,	%%mm5\n\t"\
  	"punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\
! 	"pxor	%%mm6,	%%mm5\n\t"\
  	"movq	%%mm2,	%%mm3\n\t"\
! 	"pxor	%%mm7,	%%mm3\n\t"\
  	"pfadd	%%mm3,	%%mm5\n\t"\
  	PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\


_______________________________________________
Mplayer-cvslog mailing list
Mplayer-cvslog at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-cvslog



More information about the MPlayer-cvslog mailing list