[MPlayer-dev-eng] [PATCH] mp3lib: Align output so that movaps can be used instead of movups
Zuxy Meng
zuxy.meng at gmail.com
Mon Jun 4 06:51:44 CEST 2007
Hi,
The attached patch aligns the output address of the dct64 functions to a
16-byte boundary, so that movaps can be used instead of movups in dct64_sse,
resulting in 2.5% faster decoding.
I tested it against mp3 and mp2 files and both worked fine; I couldn't find
an mp1 sample, though.
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: mp3lib/layer2.c
===================================================================
--- mp3lib/layer2.c (revision 23458)
+++ mp3lib/layer2.c (working copy)
@@ -285,7 +285,7 @@
int clip=0;
int i,j;
int stereo = fr->stereo;
- real fraction[2][4][SBLIMIT]; /* pick_table clears unused subbands */
+ real __attribute__((aligned(16))) fraction[2][4][SBLIMIT]; /* pick_table clears unused subbands */
unsigned int bit_alloc[64];
int scale[192];
int single = fr->single;
Index: mp3lib/dct64_sse.c
===================================================================
--- mp3lib/dct64_sse.c (revision 23459)
+++ mp3lib/dct64_sse.c (working copy)
@@ -5,18 +5,6 @@
* and mp3lib/dct64_MMX.c
*/
-/* NOTE: The following code is suboptimal! It can be improved (at least) by
-
- 1. Replace all movups by movaps. (Can Parameter c be always aligned on
- a 16-byte boundary?)
-
- 2. Rewritten using intrinsics. (GCC generally optimizes intrinsics
- better. However, when __m128 locals are involved, GCC may
- produce bad code that uses movaps to access a stack not aligned
- on a 16-byte boundary, which leads to run-time crashes.)
-
-*/
-
typedef float real;
extern float __attribute__((aligned(16))) costab_mmx[];
@@ -32,8 +20,8 @@
void dct64_sse(short *out0,short *out1,real *c)
{
- static real __attribute__ ((aligned(16))) b1[0x20];
- static real __attribute__ ((aligned(16))) b2[0x20];
+ real __attribute__ ((aligned(16))) b1[0x20];
+ real __attribute__ ((aligned(16))) b2[0x20];
static real const one = 1.f;
{
@@ -45,9 +33,9 @@
asm(
"movaps %2, %%xmm3\n\t"
"shufps $27, %%xmm3, %%xmm3\n\t"
- "movups %3, %%xmm1\n\t"
+ "movaps %3, %%xmm1\n\t"
"movaps %%xmm1, %%xmm4\n\t"
- "movups %4, %%xmm2\n\t"
+ "movaps %4, %%xmm2\n\t"
"shufps $27, %%xmm4, %%xmm4\n\t"
"movaps %%xmm2, %%xmm0\n\t"
"shufps $27, %%xmm0, %%xmm0\n\t"
Index: mp3lib/layer3.c
===================================================================
--- mp3lib/layer3.c (revision 23458)
+++ mp3lib/layer3.c (working copy)
@@ -1260,8 +1260,8 @@
granules = (fr->lsf) ? 1 : 2;
for (gr=0;gr<granules;gr++){
- static real hybridIn[2][SBLIMIT][SSLIMIT];
- static real hybridOut[2][SSLIMIT][SBLIMIT];
+ real __attribute__((aligned(16))) hybridIn[2][SBLIMIT][SSLIMIT];
+ real __attribute__((aligned(16))) hybridOut[2][SSLIMIT][SBLIMIT];
{ struct gr_info_s *gr_info = &(sideinfo.ch[0].gr[gr]);
int part2bits;
Index: mp3lib/layer1.c
===================================================================
--- mp3lib/layer1.c (revision 23458)
+++ mp3lib/layer1.c (working copy)
@@ -131,7 +131,7 @@
int i,stereo = fr->stereo;
unsigned int balloc[2*SBLIMIT];
unsigned int scale_index[2][SBLIMIT];
- real fraction[2][SBLIMIT];
+ real __attribute__((aligned(16))) fraction[2][SBLIMIT];
// int single = fr->single;
// printf("do_layer1(0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X )\n",
More information about the MPlayer-dev-eng
mailing list