[Mplayer-cvslog] CVS: main/libac3/mmx imdct_3dnow.c,NONE,1.1 srfft_3dnow.c,NONE,1.1
Nick Kurshev
nickols_k at users.sourceforge.net
Wed May 23 10:20:18 CEST 2001
Update of /cvsroot/mplayer/main/libac3/mmx
In directory usw-pr-cvs1:/tmp/cvs-serv11145/main/libac3/mmx
Added Files:
imdct_3dnow.c srfft_3dnow.c
Log Message:
initial 3dnow support
--- NEW FILE ---
/*
* imdct.c
*
* Copyright (C) Aaron Holtzman - May 1999
*
* This file is part of ac3dec, a free Dolby AC-3 stream decoder.
*
* ac3dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* ac3dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Modified for using AMD's 3DNow! - 3DNowEx(DSP)! SIMD operations
* by Nick Kurshev <nickols_k at mail.ru>
*/
/**
*
**/
void imdct_do_512(float data[],float delay[])
{
int i, j;
float tmp_a_r, tmp_a_i;
float *data_ptr;
float *delay_ptr;
float *window_ptr;
// 512 IMDCT with source and dest data in 'data'
// Pre IFFT complex multiply plus IFFT complex conjugate
for( i=0; i < 128; i++) {
j = pm128[i];
//a = (data[256-2*j-1] - data[2*j]) * (xcos1[j] + xsin1[j]);
//c = data[2*j] * xcos1[j];
//b = data[256-2*j-1] * xsin1[j];
//buf1[i].re = a - b + c;
//buf1[i].im = b + c;
buf[i].re = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
buf[i].im = -1.0 * (data[2*j] * xcos1[j] + data[256-2*j-1] * xsin1[j]);
}
fft_128p (&buf[0]);
// Post IFFT complex multiply plus IFFT complex conjugate
asm volatile ("femms":::"memory");
#ifndef HAVE_3DNOWEX
asm volatile ("movl $1, %%eax\n\t"
"movd %%eax, %%mm4\n\t"
"negl %%eax\n\t"
"movd %%eax, %%mm5\n\t"
"punpckldq %%mm5, %%mm4\n\t" /* 1.0 | -1.0 */
"pi2fd %%mm4, %%mm4\n\t":::"eax","memory");
#endif
for (i=0; i < 128; i++) {
asm volatile("movq %1, %%mm0\n\t" /* ac3_buf[i].re | ac3_buf[i].im */
"movq %%mm0, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */
#ifndef HAVE_3DNOWEX
"movq %%mm1, %%mm2\n\t"
"psrlq $32, %%mm1\n\t"
"punpckldq %%mm2, %%mm1\n\t"
#else
"pswapd %%mm1, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */
#endif
"movd %3, %%mm3\n\t" /* ac3_xsin[i] */
"punpckldq %2, %%mm3\n\t" /* ac3_xsin[i] | ac3_xcos[i] */
"pfmul %%mm3, %%mm0\n\t"
"pfmul %%mm3, %%mm1\n\t"
#ifndef HAVE_3DNOWEX
"pfmul %%mm4, %%mm0\n\t"
"pfacc %%mm1, %%mm0\n\t"
"movd %%mm0, 4%0\n\t"
"psrlq $32, %%mm0\n\t"
"movd %%mm0, %0\n\t"
#else
"pfpnacc %%mm1, %%mm0\n\t" /* mm0 = mm0[0] - mm0[1] | mm1[0] + mm1[1] */
"pswapd %%mm0, %%mm0\n\t"
"movq %%mm0, %0"
#endif
:"=m"(buf[i])
:"0"(buf[i]),"m"(xcos1[i]),"m"(xsin1[i])
:"memory");
/*
ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i]) + (tmp_a_i * ac3_xsin1[i]);
ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i]) - (tmp_a_i * ac3_xcos1[i]);
*/
}
asm volatile ("femms":::"memory");
data_ptr = data;
delay_ptr = delay;
window_ptr = window;
// Window and convert to real valued signal
for (i=0; i< 64; i++) {
*data_ptr++ = -buf[64+i].im * *window_ptr++ + *delay_ptr++;
*data_ptr++ = buf[64-i-1].re * *window_ptr++ + *delay_ptr++;
}
for(i=0; i< 64; i++) {
*data_ptr++ = -buf[i].re * *window_ptr++ + *delay_ptr++;
*data_ptr++ = buf[128-i-1].im * *window_ptr++ + *delay_ptr++;
}
// The trailing edge of the window goes into the delay line
delay_ptr = delay;
for(i=0; i< 64; i++) {
*delay_ptr++ = -buf[64+i].re * *--window_ptr;
*delay_ptr++ = buf[64-i-1].im * *--window_ptr;
}
for(i=0; i<64; i++) {
*delay_ptr++ = buf[i].im * *--window_ptr;
*delay_ptr++ = -buf[128-i-1].re * *--window_ptr;
}
}
/**
*
**/
void imdct_do_512_nol (float data[], float delay[])
{
int i, j;
float tmp_a_r, tmp_a_i;
float *data_ptr;
float *delay_ptr;
float *window_ptr;
//
// 512 IMDCT with source and dest data in 'data'
//
// Pre IFFT complex multiply plus IFFT cmplx conjugate
for( i=0; i < 128; i++)
{
/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) */
j = pm128[i];
//a = (data[256-2*j-1] - data[2*j]) * (xcos1[j] + xsin1[j]);
//c = data[2*j] * xcos1[j];
//b = data[256-2*j-1] * xsin1[j];
//buf1[i].re = a - b + c;
//buf1[i].im = b + c;
buf[i].re = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
buf[i].im = -1.0 * (data[2*j] * xcos1[j] + data[256-2*j-1] * xsin1[j]);
}
fft_128p(&buf[0]);
/* Post IFFT complex multiply plus IFFT complex conjugate*/
for( i=0; i < 128; i++)
{
/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
/* int j1 = i; */
tmp_a_r = buf[i].re;
tmp_a_i = buf[i].im;
//a = (tmp_a_r - tmp_a_i) * (xcos1[j] + xsin1[j]);
//b = tmp_a_r * xsin1[j];
//c = tmp_a_i * xcos1[j];
//buf[j].re = a - b + c;
//buf[j].im = b + c;
buf[i].re =(tmp_a_r * xcos1[i]) + (tmp_a_i * xsin1[i]);
buf[i].im =(tmp_a_r * xsin1[i]) - (tmp_a_i * xcos1[i]);
}
data_ptr = data;
delay_ptr = delay;
window_ptr = window;
/* Window and convert to real valued signal, no overlap here*/
for(i=0; i< 64; i++) {
*data_ptr++ = -buf[64+i].im * *window_ptr++;
*data_ptr++ = buf[64-i-1].re * *window_ptr++;
}
/* The trailing edge of the window goes into the delay line */
delay_ptr = delay;
for(i=0; i< 64; i++) {
*delay_ptr++ = -buf[64+i].re * *--window_ptr;
*delay_ptr++ = buf[64-i-1].im * *--window_ptr;
}
for(i=0; i<64; i++) {
*delay_ptr++ = buf[i].im * *--window_ptr;
*delay_ptr++ = -buf[128-i-1].re * *--window_ptr;
}
}
/**
*
**/
void imdct_do_256 (float data[],float delay[])
{
int i, j, k;
int p, q;
float tmp_a_i;
float tmp_a_r;
float *data_ptr;
float *delay_ptr;
float *window_ptr;
complex_t *buf1, *buf2;
buf1 = &buf[0];
buf2 = &buf[64];
// Pre IFFT complex multiply plus IFFT complex conjugate
for (k=0; k<64; k++) {
/* X1[k] = X[2*k] */
/* X2[k] = X[2*k+1] */
j = pm64[k];
p = 2 * (128-2*j-1);
q = 2 * (2 * j);
/* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
buf1[k].re = data[p] * xcos2[j] - data[q] * xsin2[j];
buf1[k].im = -1.0f * (data[q] * xcos2[j] + data[p] * xsin2[j]);
/* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
buf2[k].re = data[p + 1] * xcos2[j] - data[q + 1] * xsin2[j];
buf2[k].im = -1.0f * ( data[q + 1] * xcos2[j] + data[p + 1] * xsin2[j]);
}
fft_64p(&buf1[0]);
fft_64p(&buf2[0]);
#ifdef DEBUG
//DEBUG FFT
#if 0
printf ("Post FFT, buf1\n");
for (i=0; i < 64; i++)
printf("%d %f %f\n", i, buf_1[i].re, buf_1[i].im);
printf ("Post FFT, buf2\n");
for (i=0; i < 64; i++)
printf("%d %f %f\n", i, buf_2[i].re, buf_2[i].im);
#endif
#endif
// Post IFFT complex multiply
for( i=0; i < 64; i++) {
tmp_a_r = buf1[i].re;
tmp_a_i = -buf1[i].im;
buf1[i].re =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
buf1[i].im =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
tmp_a_r = buf2[i].re;
tmp_a_i = -buf2[i].im;
buf2[i].re =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
buf2[i].im =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
}
data_ptr = data;
delay_ptr = delay;
window_ptr = window;
/* Window and convert to real valued signal */
for(i=0; i< 64; i++) {
*data_ptr++ = -buf1[i].im * *window_ptr++ + *delay_ptr++;
*data_ptr++ = buf1[64-i-1].re * *window_ptr++ + *delay_ptr++;
}
for(i=0; i< 64; i++) {
*data_ptr++ = -buf1[i].re * *window_ptr++ + *delay_ptr++;
*data_ptr++ = buf1[64-i-1].im * *window_ptr++ + *delay_ptr++;
}
delay_ptr = delay;
for(i=0; i< 64; i++) {
*delay_ptr++ = -buf2[i].re * *--window_ptr;
*delay_ptr++ = buf2[64-i-1].im * *--window_ptr;
}
for(i=0; i< 64; i++) {
*delay_ptr++ = buf2[i].im * *--window_ptr;
*delay_ptr++ = -buf2[64-i-1].re * *--window_ptr;
}
}
/**
*
**/
void imdct_do_256_nol (float data[], float delay[])
{
int i, j, k;
int p, q;
float tmp_a_i;
float tmp_a_r;
float *data_ptr;
float *delay_ptr;
float *window_ptr;
complex_t *buf1, *buf2;
buf1 = &buf[0];
buf2 = &buf[64];
/* Pre IFFT complex multiply plus IFFT cmplx conjugate */
for(k=0; k<64; k++) {
/* X1[k] = X[2*k] */
/* X2[k] = X[2*k+1] */
j = pm64[k];
p = 2 * (128-2*j-1);
q = 2 * (2 * j);
/* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
buf1[k].re = data[p] * xcos2[j] - data[q] * xsin2[j];
buf1[k].im = -1.0f * (data[q] * xcos2[j] + data[p] * xsin2[j]);
/* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
buf2[k].re = data[p + 1] * xcos2[j] - data[q + 1] * xsin2[j];
buf2[k].im = -1.0f * ( data[q + 1] * xcos2[j] + data[p + 1] * xsin2[j]);
}
fft_64p(&buf1[0]);
fft_64p(&buf2[0]);
#ifdef DEBUG
//DEBUG FFT
#if 0
printf("Post FFT, buf1\n");
for (i=0; i < 64; i++)
printf("%d %f %f\n", i, buf_1[i].re, buf_1[i].im);
printf("Post FFT, buf2\n");
for (i=0; i < 64; i++)
printf("%d %f %f\n", i, buf_2[i].re, buf_2[i].im);
#endif
#endif
/* Post IFFT complex multiply */
for( i=0; i < 64; i++) {
/* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
tmp_a_r = buf1[i].re;
tmp_a_i = -buf1[i].im;
buf1[i].re =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
buf1[i].im =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
/* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
tmp_a_r = buf2[i].re;
tmp_a_i = -buf2[i].im;
buf2[i].re =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
buf2[i].im =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
}
data_ptr = data;
delay_ptr = delay;
window_ptr = window;
/* Window and convert to real valued signal, no overlap */
for(i=0; i< 64; i++) {
*data_ptr++ = -buf1[i].im * *window_ptr++;
*data_ptr++ = buf1[64-i-1].re * *window_ptr++;
}
for(i=0; i< 64; i++) {
*data_ptr++ = -buf1[i].re * *window_ptr++ + *delay_ptr++;
*data_ptr++ = buf1[64-i-1].im * *window_ptr++ + *delay_ptr++;
}
delay_ptr = delay;
for(i=0; i< 64; i++) {
*delay_ptr++ = -buf2[i].re * *--window_ptr;
*delay_ptr++ = buf2[64-i-1].im * *--window_ptr;
}
for(i=0; i< 64; i++) {
*delay_ptr++ = buf2[i].im * *--window_ptr;
*delay_ptr++ = -buf2[64-i-1].re * *--window_ptr;
}
}
--- NEW FILE ---
/*
* srfft.c
*
* Copyright (C) Yuqing Deng <Yuqing_Deng at brown.edu> - April 2000
*
* 64 and 128 point split radix fft for ac3dec
*
* The algorithm is desribed in the book:
* "Computational Frameworks of the Fast Fourier Transform".
*
* The ideas and the the organization of code borrowed from djbfft written by
* D. J. Bernstein <djb at cr.py.to>. djbff can be found at
* http://cr.yp.to/djbfft.html.
*
* srfft.c is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* srfft.c is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/
/*
* Modified for using AMD's 3DNow! - 3DNowEx(DSP)! SIMD operations support
* by Nick Kurshev <nickols_k at mail.ru>
*/
void fft_4(complex_t *x)
{
/* delta_p = 1 here */
/* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
*/
register complex_t yt, yb, u, vi;
yt.re = x[0].re;
yb.re = yt.re - x[2].re;
yt.re += x[2].re;
u.re = x[1].re;
vi.im = x[3].re - u.re;
u.re += x[3].re;
u.im = x[1].im;
vi.re = u.im - x[3].im;
u.im += x[3].im;
yt.im = yt.re;
yt.im += u.re;
x[0].re = yt.im;
yt.re -= u.re;
x[2].re = yt.re;
yt.im = yb.re;
yt.im += vi.re;
x[1].re = yt.im;
yb.re -= vi.re;
x[3].re = yb.re;
yt.im = x[0].im;
yb.im = yt.im - x[2].im;
yt.im += x[2].im;
yt.re = yt.im;
yt.re += u.im;
x[0].im = yt.re;
yt.im -= u.im;
x[2].im = yt.im;
yt.re = yb.im;
yt.re += vi.im;
x[1].im = yt.re;
yb.im -= vi.im;
x[3].im = yb.im;
}
#if 0
/* is never called */
void fftu_4(complex_t *x)
{
/* delta_p = 1 here */
/* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{2*pi/4}
*/
register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
yt_r = x[0].re;
yb_r = yt_r - x[2].re;
yt_r += x[2].re;
u_r = x[1].re;
vi_i = x[3].re - u_r;
u_r += x[3].re;
u_i = x[1].im;
vi_r = u_i - x[3].im;
u_i += x[3].im;
yt_i = yt_r;
yt_i += u_r;
x[0].re = yt_i;
yt_r -= u_r;
x[2].re = yt_r;
yt_i = yb_r;
yt_i += vi_r;
x[3].re = yt_i;
yb_r -= vi_r;
x[1].re = yb_r;
yt_i = x[0].im;
yb_i = yt_i - x[2].im;
yt_i += x[2].im;
yt_r = yt_i;
yt_r += u_i;
x[0].im = yt_r;
yt_i -= u_i;
x[2].im = yt_i;
yt_r = yb_i;
yt_r += vi_i;
x[3].im = yt_r;
yb_i -= vi_i;
x[1].im = yb_i;
}
#endif
void fft_8(complex_t *x)
{
/* delta_p = diag{1, sqrt(i)} here */
/* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
*/
complex_t wT1, wB1, wT2, wB2;
asm(
"movq 8%0, %%mm0\n\t"
"movq 24%0, %%mm1\n\t"
"movq %%mm0, %1\n\t" /* wT1 = x[1]; */
"movq %%mm1, %2\n\t" /* wB1 = x[3]; */
:"=m"(*x),"=m"(wT1), "=m"(wB1)
:"0"(*x));
asm(
"movq 16%0, %%mm0\n\t"
"movq 32%0, %%mm1\n\t"
"movq 48%0, %%mm2\n\t"
"movq %%mm0, 8%0\n\t" /* x[1] = x[2]; */
"movq %%mm1, 16%0\n\t" /* x[2] = x[4]; */
"movq %%mm2, 24%0\n\t" /* x[3] = x[6]; */
:"=m"(*x)
:"0"(*x));
asm volatile("femms":::"memory");
fft_4(&x[0]);
/* x[0] x[4] */
wT2.re = x[5].re;
wT2.re += x[7].re;
wT2.re += wT1.re;
wT2.re += wB1.re;
wT2.im = wT2.re;
wT2.re += x[0].re;
wT2.im = x[0].re - wT2.im;
x[0].re = wT2.re;
x[4].re = wT2.im;
wT2.im = x[5].im;
wT2.im += x[7].im;
wT2.im += wT1.im;
wT2.im += wB1.im;
wT2.re = wT2.im;
wT2.re += x[0].im;
wT2.im = x[0].im - wT2.im;
x[0].im = wT2.re;
x[4].im = wT2.im;
/* x[2] x[6] */
wT2.re = x[5].im;
wT2.re -= x[7].im;
wT2.re += wT1.im;
wT2.re -= wB1.im;
wT2.im = wT2.re;
wT2.re += x[2].re;
wT2.im = x[2].re - wT2.im;
x[2].re = wT2.re;
x[6].re = wT2.im;
wT2.im = x[5].re;
wT2.im -= x[7].re;
wT2.im += wT1.re;
wT2.im -= wB1.re;
wT2.re = wT2.im;
wT2.re += x[2].im;
wT2.im = x[2].im - wT2.im;
x[2].im = wT2.im;
x[6].im = wT2.re;
/* x[1] x[5] */
wT2.re = wT1.re;
wT2.re += wB1.im;
wT2.re -= x[5].re;
wT2.re -= x[7].im;
wT2.im = wT1.im;
wT2.im -= wB1.re;
wT2.im -= x[5].im;
wT2.im += x[7].re;
wB2.re = wT2.re;
wB2.re += wT2.im;
wT2.im -= wT2.re;
wB2.re *= HSQRT2;
wT2.im *= HSQRT2;
wT2.re = wB2.re;
wB2.re += x[1].re;
wT2.re = x[1].re - wT2.re;
wB2.im = x[5].re;
x[1].re = wB2.re;
x[5].re = wT2.re;
wT2.re = wT2.im;
wT2.re += x[1].im;
wT2.im = x[1].im - wT2.im;
wB2.re = x[5].im;
x[1].im = wT2.re;
x[5].im = wT2.im;
/* x[3] x[7] */
wT1.re -= wB1.im;
wT1.im += wB1.re;
wB1.re = wB2.im - x[7].im;
wB1.im = wB2.re + x[7].re;
wT1.re -= wB1.re;
wT1.im -= wB1.im;
wB1.re = wT1.re + wT1.im;
wB1.re *= HSQRT2;
wT1.im -= wT1.re;
wT1.im *= HSQRT2;
wB2.re = x[3].re;
wB2.im = wB2.re + wT1.im;
wB2.re -= wT1.im;
x[3].re = wB2.im;
x[7].re = wB2.re;
wB2.im = x[3].im;
wB2.re = wB2.im + wB1.re;
wB2.im -= wB1.re;
x[3].im = wB2.im;
x[7].im = wB2.re;
}
#if 0
/* is never called */
void fftu_8(complex_t *x)
{
/* delta_p = diag{1, sqrt(i)} here */
/* this function computes x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{2*pi/8}
*/
register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
/* 2 F_2 on x(1:4:7) and x(3:4:7) */
/* wB2 is weighted by i */
wT1_r = x[1].re;
wT1_i = x[1].im;
wB1_r = x[3].re;
wB1_i = x[3].im;
/* 1 F_4 on x(0:2:7) */
x[1] = x[2];
x[2] = x[4];
x[3] = x[6];
fft_4(&x[0]);
/* x[0] x[4] */
wT2_r = x[5].re;
wT2_r += x[7].re;
wT2_r += wT1_r;
wT2_r += wB1_r;
wT2_i = wT2_r;
wT2_r += x[0].re;
wT2_i = x[0].re - wT2_i;
x[0].re = wT2_r;
x[4].re = wT2_i;
wT2_i = x[5].im;
wT2_i += x[7].im;
wT2_i += wT1_i;
wT2_i += wB1_i;
wT2_r = wT2_i;
wT2_r += x[0].im;
wT2_i = x[0].im - wT2_i;
x[0].im = wT2_r;
x[4].im = wT2_i;
/* x[2] x[6] */
wT2_r = x[5].im;
wT2_r -= x[7].im;
wT2_r += wT1_i;
wT2_r -= wB1_i;
wT2_i = wT2_r;
wT2_r += x[2].re;
wT2_i = x[2].re - wT2_i;
x[2].re = wT2_i;
x[6].re = wT2_r;
wT2_i = x[5].re;
wT2_i -= x[7].re;
wT2_i += wT1_r;
wT2_i -= wB1_r;
wT2_r = wT2_i;
wT2_r += x[2].im;
wT2_i = x[2].im - wT2_i;
x[2].im = wT2_r;
x[6].im = wT2_i;
/* x[1] x[5] */
wT2_r = wT1_r;
wT2_r -= wB1_i;
wT2_r -= x[5].re;
wT2_r += x[7].im;
wT2_i = wT1_i;
wT2_i += wB1_r;
wT2_i -= x[5].im;
wT2_i -= x[7].re;
wB2_r = wT2_r;
wB2_r -= wT2_i;
wT2_i += wT2_r;
wB2_r *= HSQRT2;
wT2_i *= HSQRT2;
wT2_r = wB2_r;
wB2_r += x[1].re;
wT2_r = x[1].re - wT2_r;
wB2_i = x[5].re;
x[1].re = wB2_r;
x[5].re = wT2_r;
wT2_r = wT2_i;
wT2_r += x[1].im;
wT2_i = x[1].im - wT2_i;
wB2_r = x[5].im;
x[1].im = wT2_r;
x[5].im = wT2_i;
/* x[3] x[7] */
wT1_r += wB1_i;
wT1_i -= wB1_r;
wB1_r = wB2_i + x[7].im;
wB1_i = wB2_r - x[7].re;
wT1_r -= wB1_r;
wT1_i -= wB1_i;
wB1_r = wT1_r - wT1_i;
wB1_r *= HSQRT2;
wT1_i += wT1_r;
wT1_i *= HSQRT2;
wB2_r = x[3].re;
wB2_i = wB2_r + wT1_i;
wB2_r -= wT1_i;
x[3].re = wB2_r;
x[7].re = wB2_i;
wB2_i = x[3].im;
wB2_r = wB2_i + wB1_r;
wB2_i -= wB1_r;
x[3].im = wB2_r;
x[7].im = wB2_i;
}
#endif
void fft_asmb(int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3)
{
register complex_t *x2k, *x3k, *x4k, *wB;
register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
x2k = x + 2 * k;
x3k = x2k + 2 * k;
x4k = x3k + 2 * k;
wB = wTB + 2 * k;
TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
--k;
for(;;) {
TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
if (!--k) break;
x += 2;
x2k += 2;
x3k += 2;
x4k += 2;
d += 2;
d_3 += 2;
wTB += 2;
wB += 2;
}
}
void fft_asmb16(complex_t *x, complex_t *wTB)
{
register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
int k = 2;
/* transform x[0], x[8], x[4], x[12] */
TRANSZERO(x[0],x[4],x[8],x[12]);
/* transform x[1], x[9], x[5], x[13] */
TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
/* transform x[2], x[10], x[6], x[14] */
TRANSHALF_16(x[2],x[6],x[10],x[14]);
/* transform x[3], x[11], x[7], x[15] */
TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
}
_______________________________________________
Mplayer-cvslog mailing list
Mplayer-cvslog at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-cvslog
More information about the MPlayer-cvslog
mailing list