[Mplayer-cvslog] CVS: main/libac3/mmx srfftp_3dnow.h,NONE,1.1 imdct_3dnow.c,1.2,1.3 srfft_3dnow.c,1.4,1.5

Nick Kurshev nickols_k at users.sourceforge.net
Sat May 26 12:29:43 CEST 2001


Update of /cvsroot/mplayer/main/libac3/mmx
In directory usw-pr-cvs1:/tmp/cvs-serv1933/main/libac3/mmx

Modified Files:
	imdct_3dnow.c srfft_3dnow.c 
Added Files:
	srfftp_3dnow.h 
Log Message:
Improvements

--- NEW FILE ---
/* 
 *  srfftp.h
 *
 *  Copyright (C) Yuqing Deng <Yuqing_Deng at brown.edu> - April 2000
 *
 *  64 and 128 point split radix fft for ac3dec
 *
 *  The algorithm is described in the book:
 *  "Computational Frameworks of the Fast Fourier Transform".
 *
 *  The ideas and the organization of code borrowed from djbfft written by
 *  D. J. Bernstein <djb at cr.yp.to>.  djbfft can be found at 
 *  http://cr.yp.to/djbfft.html.
 *
 *  srfftp.h is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  srfftp.h is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  Modified for using AMD's 3DNow! - 3DNowEx(DSP)! SIMD operations 
 *  by Nick Kurshev <nickols_k at mail.ru>
 */

#ifndef SRFFTP_3DNOW_H__
#define SRFFTP_3DNOW_H__

#ifdef HAVE_3DNOWEX
#define TRANS_FILL_MM6_MM7_3DNOW()\
    asm(\
	"movl	$-1, %%eax\n\t"\
	"movd	%%eax, %%mm7\n\t"\
	"negl	%%eax\n\t"\
	"movd	%%eax, %%mm6\n\t"\
	"punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\
	"pi2fd	%%mm7, %%mm7\n\t"\
	"pswapd	%%mm7, %%mm6\n\t"/* 1.0 | -1.0 */\
	:::"eax","memory");
#else
#define TRANS_FILL_MM6_MM7_3DNOW()\
    asm(\
	"movl	$-1, %%eax\n\t"\
	"movd	%%eax, %%mm7\n\t"\
	"negl	%%eax\n\t"\
	"movd	%%eax, %%mm6\n\t"\
	"punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\
	"punpckldq %%mm7, %%mm6\n\t" /* 1.0 | -1.0 */\
	"pi2fd	%%mm7, %%mm7\n\t"\
	"pi2fd	%%mm6, %%mm6\n\t"\
	:::"eax","memory");
#endif

/*
 * PSWAP_MM(mm_base, mm_hlp)
 *
 * Expands to a string-literal fragment of inline-asm text that swaps the
 * low and high 32-bit halves of the MMX register mm_base in place.
 *
 *   mm_base  string literal naming the register to swap, spelled the way
 *            it must appear in an asm template with operands ("%%mm4").
 *   mm_hlp   string literal naming a scratch register.  It is overwritten
 *            in the emulated variant and unused with HAVE_3DNOWEX.
 *
 * With HAVE_3DNOWEX the swap is a single pswapd.  Without it the swap is
 * emulated: copy mm_base to mm_hlp, shift the high half of mm_base down,
 * then unpack the saved low half back in as the new high half.
 *
 * The pieces are joined by ordinary adjacent string-literal concatenation.
 * Pasting string literals with "##" does not yield a valid preprocessing
 * token and is rejected by ISO C preprocessors, so it must not be used here.
 */
#ifdef HAVE_3DNOWEX
#define PSWAP_MM(mm_base,mm_hlp) "pswapd	" mm_base "," mm_base " \n\t"
#else
#define PSWAP_MM(mm_base,mm_hlp)\
	"movq	" mm_base "," mm_hlp " \n\t"\
	"psrlq $32, " mm_base "\n\t"\
	"punpckldq " mm_hlp "," mm_base "\n\t"
#endif

/*
 * TRANSZERO_3DNOW(A0, A4, A8, A12)
 *
 * 3DNow! butterfly on four 64-bit operands, each loaded/stored with movq
 * (presumably complex values with .re in the low dword and .im in the
 * high dword, as the per-line comments suggest - confirm against the
 * complex type used by the caller).
 *
 * Besides its arguments the macro reads wTB[0] and wTB[k*2]; `wTB` and `k`
 * are NOT parameters and must be visible in the scope where the macro is
 * expanded.
 *
 * Computation (old values on the right-hand side):
 *     u    = wTB[0] + wTB[k*2]
 *     v.re = wTB[k*2].im - wTB[0].im
 *     v.im = wTB[0].re   - wTB[k*2].re
 *     A0   = A0 + u
 *     A8   = A0 - u
 *     A12  = A4 + v
 *     A4   = A4 - v
 *
 * A8 and A12 are write-only; A0 and A4 are read and then overwritten.
 * A0 and A4 each appear twice in the operand list (as output and as
 * matching input), so arguments should be free of side effects.
 *
 * The body is bracketed by femms, loads the sign constants into mm6/mm7
 * via TRANS_FILL_MM6_MM7_3DNOW(), and uses mm0-mm5 as scratch.
 */
#define TRANSZERO_3DNOW(A0,A4,A8,A12) \
{ \
    asm volatile("femms":::"memory");\
    TRANS_FILL_MM6_MM7_3DNOW()\
    asm(\
	"movq	%4, %%mm0\n\t" /* mm0 = wTB[0]*/\
	"movq	%5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \
	"movq	%%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\
	"pfadd	%%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\
	"pfmul  %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\
	"pfmul	%%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\
	"pfadd	%%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\
	"movq	%%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\
	PSWAP_MM("%%mm4","%%mm2")/* swap halves so that mm4 = v.re | v.im */\
	"movq	%6, %%mm0\n\t" /* a1 = A0;*/\
	"movq	%%mm0, %%mm1\n\t"\
	"pfadd	%%mm5, %%mm0\n\t" /*A0 = a1 + u;*/\
	"pfsub	%%mm5, %%mm1\n\t" /*A8 = a1 - u; (operand %1 is A8)*/\
	"movq	%%mm0, %0\n\t"\
	"movq	%%mm1, %1\n\t"\
	"movq	%7, %%mm2\n\t" /* a1 = A4;*/\
	"movq	%%mm2, %%mm3\n\t"\
	"pfadd	%%mm4, %%mm2\n\t" /*A12 = a1 + v;*/\
	"pfsub	%%mm4, %%mm3\n\t" /*A4  = a1 - v;*/\
	"movq	%%mm2, %3\n\t"\
	"movq	%%mm3, %2"\
	:"=m"(A0), "=m"(A8), "=m"(A4), "=m"(A12)\
	:"m"(wTB[0]), "m"(wTB[k*2]), "0"(A0), "2"(A4)\
	:"memory");\
    asm volatile("femms":::"memory");\
}

#endif

Index: imdct_3dnow.c
===================================================================
RCS file: /cvsroot/mplayer/main/libac3/mmx/imdct_3dnow.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** imdct_3dnow.c	2001/05/24 09:43:49	1.2
--- imdct_3dnow.c	2001/05/26 10:29:41	1.3
***************
*** 31,35 ****
  {
  	int i, j;
- //	float tmp_a_r, tmp_a_i;
  	float *data_ptr;
  	float *delay_ptr;
--- 31,34 ----
***************
*** 76,83 ****
  		:"m"(data[256-2*j-1]), "m"(data[2*j]), "m"(xcos1[j]), "m"(xsin1[j])
  		:"memory");
! /*
! 		buf[i].re = (data[256-2*j-1] * xcos1[j] - data[2*j] * xsin1[j]);
! 		buf[i].im = (data[256-2*j-1] * xsin1[j] + data[2*j] * xcos1[j])*(-1.0);
! */
  	}
  
--- 75,80 ----
  		:"m"(data[256-2*j-1]), "m"(data[2*j]), "m"(xcos1[j]), "m"(xsin1[j])
  		:"memory");
! /*		buf[i].re = (data[256-2*j-1] * xcos1[j] - data[2*j] * xsin1[j]);
! 		buf[i].im = (data[256-2*j-1] * xsin1[j] + data[2*j] * xcos1[j])*(-1.0);*/
  	}
  
***************
*** 132,139 ****
  		:"0"(buf[i]),"m"(xcos1[i]),"m"(xsin1[i])
  		:"memory");
! /*
! 		ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i])  +  (tmp_a_i  * ac3_xsin1[i]);
! 		ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i])  -  (tmp_a_i  * ac3_xcos1[i]);
! */
  	}
  
--- 129,134 ----
  		:"0"(buf[i]),"m"(xcos1[i]),"m"(xsin1[i])
  		:"memory");
! /*		ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i])  +  (tmp_a_i  * ac3_xsin1[i]);
! 		ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i])  -  (tmp_a_i  * ac3_xcos1[i]);*/
  	}
  
***************
*** 141,245 ****
  	delay_ptr = delay;
  	window_ptr = window;
- 
  // Window and convert to real valued signal
  	for (i=0; i< 64; i++) {
! 
  	asm volatile(
! 		"movd	%1, %%mm0\n\t"
! 		"punpckldq %2, %%mm0\n\t"
! 		"pfmul	%3, %%mm0\n\t"
! 		"pfmul	%%mm6, %%mm0\n\t"
! 		"pfadd	%4, %%mm0\n\t"
! 		"movq	%%mm0, %0"
! 		:"=m"(*data_ptr)
! 		:"m"(buf[64+i].im), "m"(buf[64-i-1].re), "m"(*window_ptr), "m"(*delay_ptr)
  		:"memory");
  		data_ptr += 2;
  		window_ptr += 2;
  		delay_ptr += 2;
- 
- /*              
- 		*data_ptr++   = -buf[64+i].im   * *window_ptr++ + *delay_ptr++;
- 		*data_ptr++   = buf[64-i-1].re * *window_ptr++ + *delay_ptr++;
- */
- 	}
- 
- 	for(i=0; i< 64; i++) {
- 
- 	asm volatile(
- 		"movd	%1, %%mm0\n\t"
- 		"punpckldq %2, %%mm0\n\t"
- 		"pfmul	%3, %%mm0\n\t"
- 		"pfmul	%%mm6, %%mm0\n\t"
- 		"pfadd	%4, %%mm0\n\t"
- 		"movq	%%mm0, %0"
- 		:"=m"(*data_ptr)
- 		:"m"(buf[i].re), "m"(buf[128-i-1].im), "m"(*window_ptr), "m"(*delay_ptr)
- 		:"memory");
- 		data_ptr += 2;
- 		window_ptr += 2;
- 		delay_ptr += 2;
- 
- /*
- 		*data_ptr++  = -buf[i].re       * *window_ptr++ + *delay_ptr++;
- 		*data_ptr++  = buf[128-i-1].im * *window_ptr++ + *delay_ptr++;
- */
  	}
! 
  // The trailing edge of the window goes into the delay line
  	delay_ptr = delay;
- 
  	for(i=0; i< 64; i++) {
! 
  	    window_ptr -=2;
  	    asm volatile(
! 		"movd	%1, %%mm0\n\t"
! 		"punpckldq %2, %%mm0\n\t"
  #ifdef HAVE_3DNOWEX
! 		"pswapd	%3, %%mm3\n\t"
  #else
! 		"movq	%3, %%mm3\n\t"
! 		"psrlq	$32, %%mm3\n\t"
! 		"punpckldq %3, %%mm3\n\t"
  #endif
  		"pfmul	%%mm3, %%mm0\n\t"
  		"pfmul	%%mm6, %%mm0\n\t"
! 		"movq	%%mm0, %0"
! 		:"=m"(*delay_ptr)
! 		:"m"(buf[64+i].re), "m"(buf[64-i-1].im), "m"(*window_ptr)
! 		:"memory");
! 		delay_ptr += 2;
! /*
! 		window_ptr--;
! 		*delay_ptr++  = -buf[64+i].re   * *window_ptr;
! 		window_ptr--;
! 		*delay_ptr++  =  buf[64-i-1].im * *window_ptr;
! */
! 	}
! 
! 	for(i=0; i<64; i++) {
! 	window_ptr -= 2;
! 	asm volatile(
! 		"movd	%1, %%mm0\n\t"
! 		"punpckldq %2, %%mm0\n\t"
! #ifdef HAVE_3DNOWEX
! 		"pswapd	%3, %%mm3\n\t"
! #else
! 		"movq	%3, %%mm3\n\t"
! 		"psrlq	$32, %%mm3\n\t"
! 		"punpckldq %3, %%mm3\n\t"
! #endif
! 		"pfmul	%%mm3, %%mm0\n\t"
! 		"pfmul	%%mm7, %%mm0\n\t"
! 		"movq	%%mm0, %0"
! 		:"=m"(*delay_ptr)
! 		:"m"(buf[i].im), "m"(buf[128-i-1].re), "m"(*window_ptr)
  		:"memory");
  		delay_ptr += 2;
- 
- /*
- 		*delay_ptr++  =  buf[i].im       * *--window_ptr;
- 		*delay_ptr++  = -buf[128-i-1].re * *--window_ptr;
- */
  	}
    asm volatile ("femms":::"memory");
--- 136,194 ----
  	delay_ptr = delay;
  	window_ptr = window;
  // Window and convert to real valued signal
  	for (i=0; i< 64; i++) {
! /* merge two loops in one to enable working of 2 decoders */
  	asm volatile(
! 		"movd	516(%1), %%mm0\n\t"
! 		"movd	(%1), %%mm1\n\t" /**data_ptr++=-buf[64+i].im**window_ptr+++*delay_ptr++;*/
! 		"punpckldq (%2), %%mm0\n\t"/*data_ptr[128]=-buf[i].re*window_ptr[128]+delay_ptr[128];*/
! 		"punpckldq 516(%2), %%mm1\n\t"
! 		"pfmul	(%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/
! 		"pfmul	512(%3), %%mm1\n\t"
! 		"pfmul	%%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/
! 		"pfmul	%%mm6, %%mm1\n\t"
! 		"pfadd	(%4), %%mm0\n\t"
! 		"pfadd	512(%4), %%mm1\n\t"
! 		"movq	%%mm0, (%0)\n\t"
! 		"movq	%%mm1, 512(%0)"
! 		:"=r"(data_ptr)
! 		:"r"(&buf[i].re), "r"(&buf[64-i-1].re), "r"(window_ptr), "r"(delay_ptr), "0"(data_ptr)
  		:"memory");
  		data_ptr += 2;
  		window_ptr += 2;
  		delay_ptr += 2;
  	}
! 	window_ptr += 128;
  // The trailing edge of the window goes into the delay line
  	delay_ptr = delay;
  	for(i=0; i< 64; i++) {
! /* merge two loops in one to enable working of 2 decoders */
  	    window_ptr -=2;
  	    asm volatile(
! 		"movd	508(%1), %%mm0\n\t"
! 		"movd	(%1), %%mm1\n\t"
! 		"punpckldq (%2), %%mm0\n\t"
! 		"punpckldq 508(%2), %%mm1\n\t"
  #ifdef HAVE_3DNOWEX
! 		"pswapd	(%3), %%mm3\n\t"
! 		"pswapd	-512(%3), %%mm4\n\t"
  #else
! 		"movq	(%3), %%mm3\n\t"/**delay_ptr++=-buf[64+i].re**--window_ptr;*/
! 		"movq	-512(%3), %%mm4\n\t"
! 		"psrlq	$32, %%mm3\n\t"/*delay_ptr[128]=buf[i].im**window_ptr[-512];*/
! 		"psrlq	$32, %%mm4\n\t"/**delay_ptr++=buf[64-i-1].im**--window_ptr;*/
! 		"punpckldq (%3), %%mm3\n\t"/*delay_ptr[128]=-buf[128-i-1].re**window_ptr[-512];*/
! 		"punpckldq -512(%3), %%mm4\n\t"
  #endif
  		"pfmul	%%mm3, %%mm0\n\t"
+ 		"pfmul	%%mm4, %%mm1\n\t"
  		"pfmul	%%mm6, %%mm0\n\t"
! 		"pfmul	%%mm7, %%mm1\n\t"
! 		"movq	%%mm0, (%0)\n\t"
! 		"movq	%%mm1, 512(%0)"
! 		:"=r"(delay_ptr)
! 		:"r"(&buf[i].im), "r"(&buf[64-i-1].im), "r"(window_ptr), "0"(delay_ptr)
  		:"memory");
  		delay_ptr += 2;
  	}
    asm volatile ("femms":::"memory");

Index: srfft_3dnow.c
===================================================================
RCS file: /cvsroot/mplayer/main/libac3/mmx/srfft_3dnow.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -r1.4 -r1.5
*** srfft_3dnow.c	2001/05/24 09:43:49	1.4
--- srfft_3dnow.c	2001/05/26 10:29:41	1.5
***************
*** 30,33 ****
--- 30,34 ----
   *  by Nick Kurshev <nickols_k at mail.ru>
   */
+ #include "mmx/srfftp_3dnow.h"
  
  void fft_4(complex_t *x)
***************
*** 394,398 ****
    wB = wTB + 2 * k;
    
!   TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
    TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
    
--- 395,399 ----
    wB = wTB + 2 * k;
    
!   TRANSZERO_3DNOW(x[0],x2k[0],x3k[0],x4k[0]);
    TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
    
***************
*** 420,424 ****
  
    /* transform x[0], x[8], x[4], x[12] */
!   TRANSZERO(x[0],x[4],x[8],x[12]);
  
    /* transform x[1], x[9], x[5], x[13] */
--- 421,425 ----
  
    /* transform x[0], x[8], x[4], x[12] */
!   TRANSZERO_3DNOW(x[0],x[4],x[8],x[12]);
  
    /* transform x[1], x[9], x[5], x[13] */


_______________________________________________
Mplayer-cvslog mailing list
Mplayer-cvslog at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-cvslog



More information about the MPlayer-cvslog mailing list