[MPlayer-dev-eng] [PATCH] Make mp3lib SIMD optimizations work on AMD64, Part 5

Zuxy Meng zuxy.meng at gmail.com
Fri May 25 08:14:49 CEST 2007


Hi,

2007/5/20, Zuxy Meng <zuxy.meng at gmail.com>:
> As discussed with Guillaume on IRC, I'll split my previous big patch
> (Rewrite synth_1to1_MMX....) into several small parts for easier
> review. Here's the first one, rewriting the generic code in
> synth_1to1_MMX from assembly to C, so we don't need to deal with
> different ABIs. I've tested it and confirmed it doesn't hurt
> performance.
>
> Note I removed a conditional jump in the remaining assembly too. By
> analyzing the code I'm sure it's never taken so don't worry about
> that. Strictly speaking it should be in a separate patch but then this
> patch would break mplayer...
>
> Part 2 will replace 32-bit leal with the equivalent add/sub (without the
> 'l' suffix) so pointer arithmetic will be 64-bit on amd64.
>
> Part 3 will remove hardcoded registers.
>
> Part 4 will kill tabinit_mmx.c. We don't need to compute the table at
> runtime; it can be predetermined.
>
> Part 5 will correct data types, replacing 'long' with 'int' where necessary.
>
> The last patch will deal with Makefile and macros.

We've reached Part 5. This is again a big diff, replacing many 'long'
with 'int', and several 'real' with 'short'. Many of these changes are
necessary because in an LP64 environment sizeof(long) != sizeof(int)
and sizeof(long) != sizeof(float); the others are optimizations, made
either because we really don't need such a long type or because the
declaration didn't match its use (declared as float, used as short).

-- 
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: mp3lib/decode_i586.c
===================================================================
--- mp3lib/decode_i586.c	(revision 23382)
+++ mp3lib/decode_i586.c	(working copy)
@@ -33,9 +33,9 @@
 #include "mangle.h"
 #define real float /* ugly - but only way */
 
-static long attribute_used buffs[1088]={0};
-static long attribute_used bo=1;
-static long attribute_used saved_ebp=0;
+static int attribute_used buffs[1088]={0};
+static int attribute_used bo=1;
+static int attribute_used saved_ebp=0;
 
 int synth_1to1_pent(real *bandPtr, int channel, short *samples)
 {
Index: mp3lib/dct64_sse.c
===================================================================
--- mp3lib/dct64_sse.c	(revision 23382)
+++ mp3lib/dct64_sse.c	(working copy)
@@ -30,15 +18,12 @@
 static const int nnnn[4] __attribute__((aligned(16))) =
 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
 
-void dct64_sse(real *a,real *b,real *c)
+void dct64_sse(short *out0,short *out1,real *c)
 {
     static real __attribute__ ((aligned(16))) b1[0x20];
     static real __attribute__ ((aligned(16))) b2[0x20];
     static real const one = 1.f;
 
-    short *out0 = (short*)a;
-    short *out1 = (short*)b;
-
     {
         real *costab = costab_mmx;
         int i;
@@ -428,7 +413,7 @@
         "fistp 416(%4)\n\t"
         ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0)
         :
-        :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(a), "r"(b)
+        :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1)
         :"memory"
         );
 #endif
Index: mp3lib/dct64_3dnow.c
===================================================================
--- mp3lib/dct64_3dnow.c	(revision 23382)
+++ mp3lib/dct64_3dnow.c	(working copy)
@@ -15,7 +15,7 @@
 static unsigned long long int attribute_used __attribute__((aligned(8))) x_plus_minus_3dnow = 0x8000000000000000ULL;
 static float attribute_used plus_1f = 1.0;
 
-void dct64_MMX_3dnow(real *a,real *b,real *c)
+void dct64_MMX_3dnow(short *a,short *b,real *c)
 {
   char tmp[256];
     __asm __volatile(
Index: mp3lib/layer3.c
===================================================================
--- mp3lib/layer3.c	(revision 23382)
+++ mp3lib/layer3.c	(working copy)
@@ -324,7 +324,7 @@
  * read additional side information (for MPEG 1 and MPEG 2)
  */
 static int III_get_side_info(struct III_sideinfo *si,int stereo,
- int ms_stereo,long sfreq,int single,int lsf)
+ int ms_stereo,int sfreq,int single,int lsf)
 {
    int ch, gr;
    int powdiff = (single == 3) ? 4 : 0;
@@ -568,10 +568,10 @@
  * Dequantize samples (includes huffman decoding)
  */
 /* 24 is enough because tab13 has max. a 19 bit huffvector */
-#define BITSHIFT ((sizeof(long)-1)*8)
+#define BITSHIFT ((sizeof(int)-1)*8)
 #define REFRESH_MASK \
   while(num < BITSHIFT) { \
-    mask |= ((unsigned long)getbyte())<<(BITSHIFT-num); \
+    mask |= ((unsigned)getbyte())<<(BITSHIFT-num); \
     num += 8; \
     part2remain -= 8; }
 
@@ -585,9 +585,9 @@
   int *me;
 
   int num=getbitoffset();
-  long mask;
+  int mask;
   /* we must split this, because for num==0 the shift is undefined if you do it in one step */
-  mask  = ((unsigned long) getbits(num))<<BITSHIFT;
+  mask  = ((unsigned) getbits(num))<<BITSHIFT;
   mask <<= 8-num;
   part2remain -= num;
 
@@ -672,7 +672,7 @@
         if(x == 15 && h->linbits) {
           max[lwin] = cb;
           REFRESH_MASK;
-          x += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits);
+          x += ((unsigned) mask) >> (BITSHIFT+8-h->linbits);
           num -= h->linbits+1;
           mask <<= h->linbits;
           if(mask < 0)
@@ -696,7 +696,7 @@
         if(y == 15 && h->linbits) {
           max[lwin] = cb;
           REFRESH_MASK;
-          y += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits);
+          y += ((unsigned) mask) >> (BITSHIFT+8-h->linbits);
           num -= h->linbits+1;
           mask <<= h->linbits;
           if(mask < 0)
@@ -850,7 +850,7 @@
         if (x == 15 && h->linbits) {
           max = cb;
 	  REFRESH_MASK;
-          x += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits);
+          x += ((unsigned) mask) >> (BITSHIFT+8-h->linbits);
           num -= h->linbits+1;
           mask <<= h->linbits;
           if(mask < 0)
@@ -874,7 +874,7 @@
         if (y == 15 && h->linbits) {
           max = cb;
 	  REFRESH_MASK;
-          y += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits);
+          y += ((unsigned) mask) >> (BITSHIFT+8-h->linbits);
           num -= h->linbits+1;
           mask <<= h->linbits;
           if(mask < 0)
@@ -1260,11 +1260,11 @@
 
   granules = (fr->lsf) ? 1 : 2;
   for (gr=0;gr<granules;gr++){
     static real hybridIn[2][SBLIMIT][SSLIMIT];
     static real hybridOut[2][SSLIMIT][SBLIMIT];
 
     { struct gr_info_s *gr_info = &(sideinfo.ch[0].gr[gr]);
-      long part2bits;
+      int part2bits;
       if(fr->lsf)
         part2bits = III_get_scale_factors_2(scalefacs[0],gr_info,0);
       else
@@ -1276,7 +1276,7 @@
     if(stereo == 2) {
       struct gr_info_s *gr_info = &(sideinfo.ch[1].gr[gr]);
       
-      long part2bits;
+      int part2bits;
       if(fr->lsf) 
         part2bits = III_get_scale_factors_2(scalefacs[1],gr_info,i_stereo);
       else
Index: mp3lib/decode_MMX.c
===================================================================
--- mp3lib/decode_MMX.c	(revision 23383)
+++ mp3lib/decode_MMX.c	(working copy)
@@ -14,7 +14,7 @@
 extern void (*dct64_MMX_func)(short*, short*, real*);
 static unsigned long long attribute_used __attribute__((aligned(8))) null_one = 0x0000ffff0000ffffULL;
 static unsigned long long attribute_used __attribute__((aligned(8))) one_null = 0xffff0000ffff0000ULL;
-unsigned long __attribute__((aligned(16))) costab_mmx[] =
+unsigned int __attribute__((aligned(16))) costab_mmx[] =
 {
 	1056974725,
 	1057056395,
Index: mp3lib/dct64_MMX.c
===================================================================
--- mp3lib/dct64_MMX.c	(revision 23382)
+++ mp3lib/dct64_MMX.c	(working copy)
@@ -6,7 +6,7 @@
 #include "mangle.h"
 #define real float /* ugly - but only way */
 
-void dct64_MMX(real *a,real *b,real *c)
+void dct64_MMX(short *a,short *b,real *c)
 {
     char tmp[256];
     __asm __volatile(
Index: mp3lib/sr1.c
===================================================================
--- mp3lib/sr1.c	(revision 23383)
+++ mp3lib/sr1.c	(working copy)
@@ -50,7 +50,6 @@
 int MP3_channels=0;
 int MP3_bps=2;
 
-static long outscale = 32768;
 #include "tabinit.c"
 
 #if 1
@@ -108,11 +107,11 @@
      {0,8,16,24,32,40,48,56,64,80,96,112,128,144,160,} }
 };
 
-static long freqs[9] = { 44100, 48000, 32000, 22050, 24000, 16000 , 11025 , 12000 , 8000 };
+static int freqs[9] = { 44100, 48000, 32000, 22050, 24000, 16000 , 11025 , 12000 , 8000 };
 
 LOCAL unsigned int getbits(short number_of_bits)
 {
-  unsigned long rval;
+  unsigned int rval;
 //  if(MP3_frames>=7741) printf("getbits: bits=%d  bitsleft=%d  wordptr=%x\n",number_of_bits,bitsleft,wordpointer);
   if((bitsleft-=number_of_bits)<0) return 0;
   if(!number_of_bits) return 0;
@@ -133,7 +132,7 @@
 
 LOCAL unsigned int getbits_fast(short number_of_bits)
 {
-  unsigned long rval;
+  unsigned int rval;
 //  if(MP3_frames>=7741) printf("getbits_fast: bits=%d  bitsleft=%d  wordptr=%x\n",number_of_bits,bitsleft,wordpointer);
   if((bitsleft-=number_of_bits)<0) return 0;
   if(!number_of_bits) return 0;
@@ -167,7 +166,7 @@
   return ((rval>>7)&1);
 }
 
-LOCAL void set_pointer(long backstep)
+LOCAL void set_pointer(int backstep)
 {
 //  if(backstep!=512 && backstep>fsizeold)
 //    printf("\rWarning! backstep (%d>%d)                                         \n",backstep,fsizeold);
@@ -178,10 +177,10 @@
 //  printf("Backstep %d  (bitsleft=%d)\n",backstep,bitsleft);
 }
 
-LOCAL int stream_head_read(unsigned char *hbuf,unsigned long *newhead){
+LOCAL int stream_head_read(unsigned char *hbuf,unsigned *newhead){
   if(mp3_read(hbuf,4) != 4) return FALSE;
 #if defined(CAN_COMPILE_X86_ASM)
-  *newhead = bswap_32(*((unsigned long *)hbuf));
+  *newhead = bswap_32(*((unsigned*)hbuf));
 #else
   /*
    * we may not be able to address unaligned 32-bit data on non-x86 cpus.
@@ -196,8 +195,8 @@
   return TRUE;
 }
 
-LOCAL int stream_head_shift(unsigned char *hbuf,unsigned long *head){
-  *((unsigned long *)hbuf) >>= 8;
+LOCAL int stream_head_shift(unsigned char *hbuf,unsigned *head){
+  *((unsigned*)hbuf) >>= 8;
   if(mp3_read(hbuf+3,1) != 1) return 0;
   *head <<= 8;
   *head |= hbuf[3];
@@ -208,7 +207,7 @@
  * decode a header and write the information
  * into the frame structure
  */
-LOCAL int decode_header(struct frame *fr,unsigned long newhead){
+LOCAL int decode_header(struct frame *fr,unsigned newhead){
 
     // head_check:
     if( (newhead & 0xffe00000) != 0xffe00000 ||  
@@ -217,8 +216,8 @@
     fr->lay = 4-((newhead>>17)&3);
 //    if(fr->lay!=3) return FALSE;
 
-    if( newhead & ((long)1<<20) ) {
-      fr->lsf = (newhead & ((long)1<<19)) ? 0x0 : 0x1;
+    if( newhead & (1<<20) ) {
+      fr->lsf = (newhead & (1<<19)) ? 0x0 : 0x1;
       fr->mpeg25 = 0;
     } else {
       fr->lsf = 1;
@@ -253,7 +252,7 @@
   case 2:
     MP3_bitrate=tabsel_123[fr->lsf][1][fr->bitrate_index];
     MP3_samplerate=freqs[fr->sampling_frequency];
-    fr->framesize = (long) MP3_bitrate * 144000;
+    fr->framesize = MP3_bitrate * 144000;
     fr->framesize /= MP3_samplerate;
     MP3_framesize=fr->framesize;
     fr->framesize += fr->padding - 4;
@@ -267,7 +266,7 @@
 
     MP3_bitrate=tabsel_123[fr->lsf][2][fr->bitrate_index];
     MP3_samplerate=freqs[fr->sampling_frequency];
-    fr->framesize  = (long) MP3_bitrate * 144000;
+    fr->framesize  = MP3_bitrate * 144000;
     fr->framesize /= MP3_samplerate<<(fr->lsf);
     MP3_framesize=fr->framesize;
     fr->framesize += fr->padding - 4;
@@ -276,7 +275,7 @@
 //    fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO) ? (fr->mode_ext<<2)+4 : 32;
     MP3_bitrate=tabsel_123[fr->lsf][0][fr->bitrate_index];
     MP3_samplerate=freqs[fr->sampling_frequency];
-    fr->framesize  = (long) MP3_bitrate * 12000;
+    fr->framesize  = MP3_bitrate * 12000;
     fr->framesize /= MP3_samplerate;
     MP3_framesize  = ((fr->framesize+fr->padding)<<2);
     fr->framesize  = MP3_framesize-4;
@@ -314,10 +313,10 @@
  * read next frame     return number of frames read.
  */
 LOCAL int read_frame(struct frame *fr){
-  unsigned long newhead;
+  unsigned newhead;
   union {
     unsigned char buf[8];
-    unsigned long dummy; // for alignment
+    unsigned dummy; // for alignment
   } hbuf;
   int skipped,resyncpos;
   int frames=0;
@@ -393,11 +392,11 @@
 /******************************************************************************/
 
 /* It's hidden from gcc in assembler */
-extern void dct64_MMX(real *, real *, real *);
-extern void dct64_MMX_3dnow(real *, real *, real *);
-extern void dct64_MMX_3dnowex(real *, real *, real *);
-extern void dct64_sse(real *, real *, real *);
-void (*dct64_MMX_func)(real *, real *, real *);
+extern void dct64_MMX(short *, short *, real *);
+extern void dct64_MMX_3dnow(short *, short *, real *);
+extern void dct64_MMX_3dnowex(short *, short *, real *);
+extern void dct64_sse(short *, short *, real *);
+void (*dct64_MMX_func)(short *, short *, real *);
 
 #include "cpudetect.h"
 
@@ -413,7 +412,7 @@
     _has_mmx = 0;
     dct36_func = dct36;
 
-    make_decode_tables(outscale);
+    make_decode_tables();
 
 #ifdef CAN_COMPILE_X86_ASM
 
Index: mp3lib/mpg123.h
===================================================================
--- mp3lib/mpg123.h	(revision 23382)
+++ mp3lib/mpg123.h	(working copy)
@@ -71,7 +71,7 @@
     int lay;
     int error_protection;
     int bitrate_index;
-    long sampling_frequency;
+    int sampling_frequency;
     int padding;
     int extension;
     int mode;
@@ -79,7 +79,7 @@
     int copyright;
          int original;
          int emphasis;
-         long framesize; /* computed framesize */
+         int framesize; /* computed framesize */
 };
 
 
@@ -117,7 +117,7 @@
 extern real *mp3lib_pnts[];
 
 extern int synth_1to1_pent( real *,int,short * );
-extern void make_decode_tables_MMX(long scaleval);
+extern void make_decode_tables_MMX(void);
 extern int synth_1to1_MMX( real *,int,short * );
 extern int synth_1to1_MMX_s(real *, int, short *, short *, int *);
 
Index: mp3lib/dct64_k7.c
===================================================================
--- mp3lib/dct64_k7.c	(revision 23382)
+++ mp3lib/dct64_k7.c	(working copy)
@@ -15,7 +15,7 @@
 static unsigned long long int attribute_used __attribute__((aligned(8))) x_plus_minus_3dnow = 0x8000000000000000ULL;
 static float attribute_used plus_1f = 1.0;
 
-void dct64_MMX_3dnowex(real *a,real *b,real *c)
+void dct64_MMX_3dnowex(short *a,short *b,real *c)
 {
   char tmp[256];
     __asm __volatile(
Index: mp3lib/tabinit.c
===================================================================
--- mp3lib/tabinit.c	(revision 23382)
+++ mp3lib/tabinit.c	(working copy)
@@ -8,7 +8,7 @@
 static real cos64[32], cos32[16], cos16[8], cos8[4], cos4[2];
 real *mp3lib_pnts[]={ cos64,cos32,cos16,cos8,cos4 };
 
-static long intwinbase[] = {
+static int intwinbase[] = {
      0,    -1,    -1,    -1,    -1,    -1,    -1,    -2,    -2,    -2,
     -2,    -3,    -3,    -4,    -4,    -5,    -5,    -6,    -7,    -7,
     -8,    -9,   -10,   -11,   -13,   -14,   -16,   -17,   -19,   -21,
@@ -36,9 +36,9 @@
  64019, 65290, 66494, 67629, 68692, 69679, 70590, 71420, 72169, 72835,
  73415, 73908, 74313, 74630, 74856, 74992, 75038 };
 
-static void make_decode_tables(long scaleval)
+static void make_decode_tables()
 {
-  int i,j,k,kr,divv;
+  int i,j,k,kr,divv,scaleval=32768;
   real *table,*costab;
 
 


More information about the MPlayer-dev-eng mailing list