[Mplayer-cvslog] CVS: main/liba52 imdct.c,1.11,1.12 Makefile,1.3,1.4 test.c,1.4,1.5
Michael Niedermayer
michael at mplayer.dev.hu
Tue Dec 18 05:00:31 CET 2001
Update of /cvsroot/mplayer/main/liba52
In directory mplayer:/var/tmp.root/cvs-serv13920
Modified Files:
imdct.c Makefile test.c
Log Message:
Runtime CPU detection for the IMDCT
Index: imdct.c
===================================================================
RCS file: /cvsroot/mplayer/main/liba52/imdct.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -r1.11 -r1.12
--- imdct.c 17 Dec 2001 03:30:08 -0000 1.11
+++ imdct.c 18 Dec 2001 04:00:29 -0000 1.12
@@ -19,9 +19,12 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * SSE optimizations from Michael Niedermayer (michaelni at gmx.at)
*/
#include "config.h"
+#include "../cpudetect.h"
#include <math.h>
#include <stdio.h>
@@ -72,7 +75,7 @@
0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
-#ifdef HAVE_SSE
+#ifdef ARCH_X86
// NOTE: SSE needs 16byte alignment or it will segfault
static complex_t __attribute__((aligned(16))) buf[128];
static float __attribute__((aligned(16))) sseSinCos1a[256];
@@ -185,7 +188,194 @@
/* 512 IMDCT with source and dest data in 'data' */
-#ifdef HAVE_SSE
+ /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
+ for( i=0; i < 128; i++) {
+ /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
+ buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]);
+ buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i]));
+ }
+
+ /* Bit reversed shuffling */
+ for(i=0; i<128; i++) {
+ k = bit_reverse_512[i];
+ if (k < i)
+ swap_cmplx(&buf[i],&buf[k]);
+ }
+
+
+ /* FFT Merge */
+/* unoptimized variant
+ for (m=1; m < 7; m++) {
+ if(m)
+ two_m = (1 << m);
+ else
+ two_m = 1;
+
+ two_m_plus_one = (1 << (m+1));
+
+ for(i = 0; i < 128; i += two_m_plus_one) {
+ for(k = 0; k < two_m; k++) {
+ p = k + i;
+ q = p + two_m;
+ tmp_a_r = buf[p].real;
+ tmp_a_i = buf[p].imag;
+ tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
+ tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
+ buf[p].real = tmp_a_r + tmp_b_r;
+ buf[p].imag = tmp_a_i + tmp_b_i;
+ buf[q].real = tmp_a_r - tmp_b_r;
+ buf[q].imag = tmp_a_i - tmp_b_i;
+ }
+ }
+ }
+*/
+
+ for(i = 0; i < 128; i += 2) {
+ tmp_a_r = buf[i].real;
+ tmp_a_i = buf[i].imag;
+ tmp_b_r = buf[i+1].real;
+ tmp_b_i = buf[i+1].imag;
+ buf[i].real = tmp_a_r + tmp_b_r;
+ buf[i].imag = tmp_a_i + tmp_b_i;
+ buf[i+1].real = tmp_a_r - tmp_b_r;
+ buf[i+1].imag = tmp_a_i - tmp_b_i;
+ }
+
+ // Note w[1]={{1,0}, {0,-1}}
+ for(i = 0; i < 128; i += 4) {
+ tmp_a_r = buf[i].real;
+ tmp_a_i = buf[i].imag;
+ tmp_b_r = buf[i+2].real;
+ tmp_b_i = buf[i+2].imag;
+ buf[i].real = tmp_a_r + tmp_b_r;
+ buf[i].imag = tmp_a_i + tmp_b_i;
+ buf[i+2].real = tmp_a_r - tmp_b_r;
+ buf[i+2].imag = tmp_a_i - tmp_b_i;
+ tmp_a_r = buf[i+1].real;
+ tmp_a_i = buf[i+1].imag;
+ tmp_b_r = buf[i+3].imag;
+ tmp_b_i = buf[i+3].real;
+ buf[i+1].real = tmp_a_r + tmp_b_r;
+ buf[i+1].imag = tmp_a_i - tmp_b_i;
+ buf[i+3].real = tmp_a_r - tmp_b_r;
+ buf[i+3].imag = tmp_a_i + tmp_b_i;
+ }
+
+ for(i = 0; i < 128; i += 8) {
+ tmp_a_r = buf[i].real;
+ tmp_a_i = buf[i].imag;
+ tmp_b_r = buf[i+4].real;
+ tmp_b_i = buf[i+4].imag;
+ buf[i].real = tmp_a_r + tmp_b_r;
+ buf[i].imag = tmp_a_i + tmp_b_i;
+ buf[i+4].real = tmp_a_r - tmp_b_r;
+ buf[i+4].imag = tmp_a_i - tmp_b_i;
+ tmp_a_r = buf[1+i].real;
+ tmp_a_i = buf[1+i].imag;
+ tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
+ tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
+ buf[1+i].real = tmp_a_r + tmp_b_r;
+ buf[1+i].imag = tmp_a_i + tmp_b_i;
+ buf[i+5].real = tmp_a_r - tmp_b_r;
+ buf[i+5].imag = tmp_a_i - tmp_b_i;
+ tmp_a_r = buf[i+2].real;
+ tmp_a_i = buf[i+2].imag;
+ tmp_b_r = buf[i+6].imag;
+ tmp_b_i = - buf[i+6].real;
+ buf[i+2].real = tmp_a_r + tmp_b_r;
+ buf[i+2].imag = tmp_a_i + tmp_b_i;
+ buf[i+6].real = tmp_a_r - tmp_b_r;
+ buf[i+6].imag = tmp_a_i - tmp_b_i;
+ tmp_a_r = buf[i+3].real;
+ tmp_a_i = buf[i+3].imag;
+ tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
+ tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
+ buf[i+3].real = tmp_a_r + tmp_b_r;
+ buf[i+3].imag = tmp_a_i + tmp_b_i;
+ buf[i+7].real = tmp_a_r - tmp_b_r;
+ buf[i+7].imag = tmp_a_i - tmp_b_i;
+ }
+
+ for (m=3; m < 7; m++) {
+ two_m = (1 << m);
+
+ two_m_plus_one = two_m<<1;
+
+ for(i = 0; i < 128; i += two_m_plus_one) {
+ for(k = 0; k < two_m; k++) {
+ int p = k + i;
+ int q = p + two_m;
+ tmp_a_r = buf[p].real;
+ tmp_a_i = buf[p].imag;
+ tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
+ tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
+ buf[p].real = tmp_a_r + tmp_b_r;
+ buf[p].imag = tmp_a_i + tmp_b_i;
+ buf[q].real = tmp_a_r - tmp_b_r;
+ buf[q].imag = tmp_a_i - tmp_b_i;
+ }
+ }
+ }
+
+ /* Post IFFT complex multiply plus IFFT complex conjugate*/
+ for( i=0; i < 128; i++) {
+ /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
+ tmp_a_r = buf[i].real;
+ tmp_a_i = -1.0 * buf[i].imag;
+ buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
+ buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
+ }
+
+ data_ptr = data;
+ delay_ptr = delay;
+ window_ptr = imdct_window;
+
+ /* Window and convert to real valued signal */
+ for(i=0; i< 64; i++) {
+ *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
+ *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
+ }
+
+ for(i=0; i< 64; i++) {
+ *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
+ *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
+ }
+
+ /* The trailing edge of the window goes into the delay line */
+ delay_ptr = delay;
+
+ for(i=0; i< 64; i++) {
+ *delay_ptr++ = -buf[64+i].real * *--window_ptr;
+ *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
+ }
+
+ for(i=0; i<64; i++) {
+ *delay_ptr++ = buf[i].imag * *--window_ptr;
+ *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
+ }
+}
+
+#ifdef ARCH_X86
+void
+imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
+{
+ int i,k;
+ int p,q;
+ int m;
+ int two_m;
+ int two_m_plus_one;
+
+ sample_t tmp_a_i;
+ sample_t tmp_a_r;
+ sample_t tmp_b_i;
+ sample_t tmp_b_r;
+
+ sample_t *data_ptr;
+ sample_t *delay_ptr;
+ sample_t *window_ptr;
+
+ /* 512 IMDCT with source and dest data in 'data' */
+
/* Pre IFFT complex multiply plus IFFT cmplx conjugate */
/* Bit reversed shuffling */
asm volatile(
@@ -214,21 +404,6 @@
:: "b" (data), "c" (buf)
: "%esi", "%edi", "%eax", "%edx"
);
-#else
- /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
- for( i=0; i < 128; i++) {
- /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
- buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]);
- buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i]));
- }
-
- /* Bit reversed shuffling */
- for(i=0; i<128; i++) {
- k = bit_reverse_512[i];
- if (k < i)
- swap_cmplx(&buf[i],&buf[k]);
- }
-#endif
/* FFT Merge */
@@ -258,7 +433,6 @@
}
*/
-#ifdef HAVE_SSE
// Note w[0][0]={1,0}
asm volatile(
"xorps %%xmm1, %%xmm1 \n\t"
@@ -279,21 +453,8 @@
:: "g" (buf), "r" (buf + 128)
: "%esi"
);
-#else
- for(i = 0; i < 128; i += 2) {
- tmp_a_r = buf[i].real;
- tmp_a_i = buf[i].imag;
- tmp_b_r = buf[i+1].real;
- tmp_b_i = buf[i+1].imag;
- buf[i].real = tmp_a_r + tmp_b_r;
- buf[i].imag = tmp_a_i + tmp_b_i;
- buf[i+1].real = tmp_a_r - tmp_b_r;
- buf[i+1].imag = tmp_a_i - tmp_b_i;
- }
-#endif
// Note w[1]={{1,0}, {0,-1}}
-#ifdef HAVE_SSE
asm volatile(
"movaps ps111_1, %%xmm7 \n\t" // 1,1,1,-1
"movl %0, %%esi \n\t"
@@ -314,28 +475,7 @@
:: "g" (buf), "r" (buf + 128)
: "%esi"
);
-#else
- for(i = 0; i < 128; i += 4) {
- tmp_a_r = buf[i].real;
- tmp_a_i = buf[i].imag;
- tmp_b_r = buf[i+2].real;
- tmp_b_i = buf[i+2].imag;
- buf[i].real = tmp_a_r + tmp_b_r;
- buf[i].imag = tmp_a_i + tmp_b_i;
- buf[i+2].real = tmp_a_r - tmp_b_r;
- buf[i+2].imag = tmp_a_i - tmp_b_i;
- tmp_a_r = buf[i+1].real;
- tmp_a_i = buf[i+1].imag;
- tmp_b_r = buf[i+3].imag;
- tmp_b_i = buf[i+3].real;
- buf[i+1].real = tmp_a_r + tmp_b_r;
- buf[i+1].imag = tmp_a_i - tmp_b_i;
- buf[i+3].real = tmp_a_r - tmp_b_r;
- buf[i+3].imag = tmp_a_i + tmp_b_i;
- }
-#endif
-#ifdef HAVE_SSE
/*
Note sseW2+0={1,1,sqrt(2),sqrt(2))
Note sseW2+16={0,0,sqrt(2),-sqrt(2))
@@ -380,44 +520,7 @@
:: "g" (buf), "r" (buf + 128)
: "%esi"
);
-#else
- for(i = 0; i < 128; i += 8) {
- tmp_a_r = buf[i].real;
- tmp_a_i = buf[i].imag;
- tmp_b_r = buf[i+4].real;
- tmp_b_i = buf[i+4].imag;
- buf[i].real = tmp_a_r + tmp_b_r;
- buf[i].imag = tmp_a_i + tmp_b_i;
- buf[i+4].real = tmp_a_r - tmp_b_r;
- buf[i+4].imag = tmp_a_i - tmp_b_i;
- tmp_a_r = buf[1+i].real;
- tmp_a_i = buf[1+i].imag;
- tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
- tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
- buf[1+i].real = tmp_a_r + tmp_b_r;
- buf[1+i].imag = tmp_a_i + tmp_b_i;
- buf[i+5].real = tmp_a_r - tmp_b_r;
- buf[i+5].imag = tmp_a_i - tmp_b_i;
- tmp_a_r = buf[i+2].real;
- tmp_a_i = buf[i+2].imag;
- tmp_b_r = buf[i+6].imag;
- tmp_b_i = - buf[i+6].real;
- buf[i+2].real = tmp_a_r + tmp_b_r;
- buf[i+2].imag = tmp_a_i + tmp_b_i;
- buf[i+6].real = tmp_a_r - tmp_b_r;
- buf[i+6].imag = tmp_a_i - tmp_b_i;
- tmp_a_r = buf[i+3].real;
- tmp_a_i = buf[i+3].imag;
- tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
- tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
- buf[i+3].real = tmp_a_r + tmp_b_r;
- buf[i+3].imag = tmp_a_i + tmp_b_i;
- buf[i+7].real = tmp_a_r - tmp_b_r;
- buf[i+7].imag = tmp_a_i - tmp_b_i;
- }
-#endif
-#ifdef HAVE_SSE
for (m=3; m < 7; m++) {
two_m = (1 << m);
two_m_plus_one = two_m<<1;
@@ -452,28 +555,6 @@
);
}
-#else
- for (m=3; m < 7; m++) {
- two_m = (1 << m);
-
- two_m_plus_one = two_m<<1;
-
- for(i = 0; i < 128; i += two_m_plus_one) {
- for(k = 0; k < two_m; k++) {
- int p = k + i;
- int q = p + two_m;
- tmp_a_r = buf[p].real;
- tmp_a_i = buf[p].imag;
- tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
- tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
- buf[p].real = tmp_a_r + tmp_b_r;
- buf[p].imag = tmp_a_i + tmp_b_i;
- buf[q].real = tmp_a_r - tmp_b_r;
- buf[q].imag = tmp_a_i - tmp_b_i;
- }
- }
- }
-#endif
/* Post IFFT complex multiply plus IFFT complex conjugate*/
for( i=0; i < 128; i++) {
@@ -489,7 +570,6 @@
window_ptr = imdct_window;
/* Window and convert to real valued signal */
-#ifdef HAVE_SSE
asm volatile(
"xorl %%edi, %%edi \n\t" // 0
"xorl %%esi, %%esi \n\t" // 0
@@ -516,14 +596,7 @@
data_ptr+=128;
delay_ptr+=128;
// window_ptr+=128;
-#else
- for(i=0; i< 64; i++) {
- *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
- }
-#endif
-
-#ifdef HAVE_SSE
+
asm volatile(
"movl $1024, %%edi \n\t" // 512
"xorl %%esi, %%esi \n\t" // 0
@@ -549,17 +622,10 @@
);
data_ptr+=128;
// window_ptr+=128;
-#else
- for(i=0; i< 64; i++) {
- *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
- }
-#endif
/* The trailing edge of the window goes into the delay line */
delay_ptr = delay;
-#ifdef HAVE_SSE
asm volatile(
"xorl %%edi, %%edi \n\t" // 0
"xorl %%esi, %%esi \n\t" // 0
@@ -581,14 +647,7 @@
);
delay_ptr+=128;
// window_ptr-=128;
-#else
- for(i=0; i< 64; i++) {
- *delay_ptr++ = -buf[64+i].real * *--window_ptr;
- *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
- }
-#endif
-
-#ifdef HAVE_SSE
+
asm volatile(
"movl $1024, %%edi \n\t" // 1024
"xorl %%esi, %%esi \n\t" // 0
@@ -608,13 +667,8 @@
:: "r" (buf), "r" (delay_ptr)
: "%esi", "%edi"
);
-#else
- for(i=0; i<64; i++) {
- *delay_ptr++ = buf[i].imag * *--window_ptr;
- *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
- }
-#endif
}
+#endif //arch_x86
void
imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
@@ -756,14 +810,15 @@
{
int i, j, k;
- fprintf (stderr, "No accelerated IMDCT transform found\n");
+ if(gCpuCaps.hasSSE) fprintf (stderr, "Using SSE optimized IMDCT transform\n");
+ else fprintf (stderr, "No accelerated IMDCT transform found\n");
/* Twiddle factors to turn IFFT into IMDCT */
for (i = 0; i < 128; i++) {
xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
}
-#ifdef HAVE_SSE
+#ifdef ARCH_X86
for (i = 0; i < 128; i++) {
sseSinCos1a[2*i+0]= -xsin1[i];
sseSinCos1a[2*i+1]= -xcos1[i];
@@ -785,7 +840,7 @@
w[i][k].imag = sin (-M_PI * k / j);
}
}
-#ifdef HAVE_SSE
+#ifdef ARCH_X86
for (i = 1; i < 7; i++) {
j = 1 << i;
for (k = 0; k < j; k+=2) {
@@ -828,9 +883,10 @@
sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
}
-#endif
-
- imdct_512 = imdct_do_512;
+#endif // arch_x86
+
+ if(gCpuCaps.hasSSE) imdct_512 = imdct_do_512_sse;
+ else imdct_512 = imdct_do_512;
imdct_256 = imdct_do_256;
}
}
Index: Makefile
===================================================================
RCS file: /cvsroot/mplayer/main/liba52/Makefile,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- Makefile 14 Dec 2001 20:14:59 -0000 1.3
+++ Makefile 18 Dec 2001 04:00:29 -0000 1.4
@@ -19,7 +19,7 @@
$(AR) r $(LIBNAME) $(OBJS)
test: $(LIBNAME) test.c
- $(CC) $(CFLAGS) test.c -o test -L. -la52
+ $(CC) $(CFLAGS) test.c ../cpudetect.c -o test -L. -la52 -lm
test2: $(LIBNAME) test.c
$(CC) $(CFLAGS) test.c -o test2 -L../libac3 -lac3 -L. -la52
Index: test.c
===================================================================
RCS file: /cvsroot/mplayer/main/liba52/test.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- test.c 17 Dec 2001 17:21:50 -0000 1.4
+++ test.c 18 Dec 2001 04:00:29 -0000 1.5
@@ -11,6 +11,7 @@
#include <inttypes.h>
#include "a52.h"
+#include "../cpudetect.h"
static sample_t * samples;
static a52_state_t state;
@@ -46,6 +47,11 @@
long long t, sum=0, min=256*256*256*64;
#endif
+ FILE *temp= stdout;
+ stdout= stderr; //EVIL HACK FIXME
+ GetCpuCaps(&gCpuCaps);
+ stdout= temp;
+
samples = a52_init (accel);
if (samples == NULL) {
fprintf (stderr, "A52 init failed\n");
@@ -81,7 +87,7 @@
buf_size=0;
// decode:
- flags=A52_STEREO; // A52_DOLBY // A52_2F2R // A52_3F2R | A52_LFE
+ flags=A52_STEREO; //A52_STEREO; // A52_DOLBY // A52_2F2R // A52_3F2R | A52_LFE
channels=2;
flags |= A52_ADJUST_LEVEL;
@@ -92,15 +98,15 @@
// a52_dynrng (&state, NULL, NULL); // disable dynamic range compensation
+STARTTIMING
a52_resample_init(flags,channels);
s16 = out_buf;
for (i = 0; i < 6; i++) {
-STARTTIMING
if (a52_block (&state, samples))
{ fprintf(stderr,"error at sampling\n"); break; }
-ENDTIMING
// float->int + channels interleaving:
s16+=a52_resample(samples,s16);
+ENDTIMING
}
#ifdef TIMING
if(sum<min) min=sum;
More information about the MPlayer-cvslog
mailing list