[Mplayer-cvslog] CVS: main/postproc swscale.c,1.49,1.50 swscale_template.c,1.51,1.52
Michael Niedermayer
michael at mplayer.dev.hu
Thu Dec 6 20:07:33 CET 2001
Update of /cvsroot/mplayer/main/postproc
In directory mplayer:/var/tmp.root/cvs-serv6124
Modified Files:
swscale.c swscale_template.c
Log Message:
print more info if -v
use new horizontal mmx scaler instead of old x86asm if mmx2 can't be used (FAST_BILINEAR only)
fixed overflow in init function ... using double precision fp now :)
using C scaler for the last 1-2 lines if there is a chance to write over the end of the dst array
Index: swscale.c
===================================================================
RCS file: /cvsroot/mplayer/main/postproc/swscale.c,v
retrieving revision 1.49
retrieving revision 1.50
diff -u -r1.49 -r1.50
--- swscale.c 6 Dec 2001 00:10:42 -0000 1.49
+++ swscale.c 6 Dec 2001 19:07:25 -0000 1.50
@@ -31,14 +31,14 @@
//#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
#define ASSERT(x) ;
-
+extern int verbose; // defined in mplayer.c
/*
NOTES
known BUGS with known cause (no bugreports please!, but patches are welcome :) )
-horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
+horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)
-Supported output formats BGR15 BGR16 BGR24 BGR32, YV12
+Supported output formats BGR15 BGR16 BGR24 BGR32 YV12
BGR15 & BGR16 MMX verions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
@@ -49,6 +49,7 @@
Move static / global vars into a struct so multiple scalers can be used
write special vertical cubic upscale version
Optimize C code (yv12 / minmax)
+dstStride[3]
*/
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
@@ -183,6 +184,203 @@
}
#endif
+static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
+				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
+{
+	// Plain C vertical filter for planar output: dest[i]= clip((sum_j lumSrc[j][i]*lumFilter[j])>>19). FIXME optimize
+	int i;
+	for(i=0; i<dstW; i++)
+	{
+		int val=0;
+		int j;
+		for(j=0; j<lumFilterSize; j++)
+			val += lumSrc[j][i] * lumFilter[j];
+		// shift the weighted sum down by 19 bits and clamp it to 0..255
+		dest[i]= MIN(MAX(val>>19, 0), 255);
+	}
+	// chroma is skipped when uDest is NULL; the V samples sit 2048 entries after the U samples in each chrSrc line
+	if(uDest != NULL)
+		for(i=0; i<(dstW>>1); i++)
+		{
+			int u=0;
+			int v=0;
+			int j;
+			for(j=0; j<chrFilterSize; j++) // chroma taps are counted by chrFilterSize, not lumFilterSize
+			{
+				u += chrSrc[j][i] * chrFilter[j];
+				v += chrSrc[j][i + 2048] * chrFilter[j];
+			}
+
+			uDest[i]= MIN(MAX(u>>19, 0), 255);
+			vDest[i]= MIN(MAX(v>>19, 0), 255);
+		}
+}
+// yuv2rgbXinC: plain C path; vertically filters lumSrc/chrSrc and writes dstW packed pixels to dest. dstbpp selects the 32/24/16/15 branch, any other value writes nothing.
+static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
+ int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+ uint8_t *dest, int dstW, int dstbpp)
+{
+ if(dstbpp==32)
+ {
+ int i;
+ for(i=0; i<(dstW>>1); i++){ // two pixels per iteration; with an odd dstW the last pixel is not written
+ int j;
+ int Y1=0;
+ int Y2=0;
+ int U=0;
+ int V=0;
+ int Cb, Cr, Cg;
+ for(j=0; j<lumFilterSize; j++) // weighted sum over the luma input lines
+ {
+ Y1 += lumSrc[j][2*i] * lumFilter[j];
+ Y2 += lumSrc[j][2*i+1] * lumFilter[j];
+ }
+ for(j=0; j<chrFilterSize; j++) // one U/V pair is shared by both pixels
+ {
+ U += chrSrc[j][i] * chrFilter[j];
+ V += chrSrc[j][i+2048] * chrFilter[j]; // V is read 2048 entries after U in the same line
+ }
+ Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ]; // sums are shifted down by 19 bits before the table lookup
+ Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
+ U >>= 19;
+ V >>= 19;
+ // every table is indexed with a +256 bias (presumably so slightly out-of-range values stay inside the table - tables are defined elsewhere)
+ Cb= clip_yuvtab_40cf[U+ 256];
+ Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
+ Cr= clip_yuvtab_3343[V+ 256];
+ // byte order per pixel is (Y+Cb), (Y+Cg), (Y+Cr); bytes 3 and 7 of each 8 byte pair are left untouched
+ dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
+ dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
+ dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
+
+ dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
+ dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
+ dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
+ }
+ }
+ else if(dstbpp==24)
+ {
+ int i;
+ for(i=0; i<(dstW>>1); i++){ // same filtering as the 32 bit branch, only the store differs
+ int j;
+ int Y1=0;
+ int Y2=0;
+ int U=0;
+ int V=0;
+ int Cb, Cr, Cg;
+ for(j=0; j<lumFilterSize; j++)
+ {
+ Y1 += lumSrc[j][2*i] * lumFilter[j];
+ Y2 += lumSrc[j][2*i+1] * lumFilter[j];
+ }
+ for(j=0; j<chrFilterSize; j++)
+ {
+ U += chrSrc[j][i] * chrFilter[j];
+ V += chrSrc[j][i+2048] * chrFilter[j];
+ }
+ Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
+ Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
+ U >>= 19;
+ V >>= 19;
+
+ Cb= clip_yuvtab_40cf[U+ 256];
+ Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
+ Cr= clip_yuvtab_3343[V+ 256];
+ // 3 bytes per pixel, written through a moving dest pointer instead of an index
+ dest[0]=clip_table[((Y1 + Cb) >>13)];
+ dest[1]=clip_table[((Y1 + Cg) >>13)];
+ dest[2]=clip_table[((Y1 + Cr) >>13)];
+
+ dest[3]=clip_table[((Y2 + Cb) >>13)];
+ dest[4]=clip_table[((Y2 + Cg) >>13)];
+ dest[5]=clip_table[((Y2 + Cr) >>13)];
+ dest+=6; // advance past the two pixels just written
+ }
+ }
+ else if(dstbpp==16)
+ {
+ int i;
+ for(i=0; i<(dstW>>1); i++){ // same filtering as the 32 bit branch, only the store differs
+ int j;
+ int Y1=0;
+ int Y2=0;
+ int U=0;
+ int V=0;
+ int Cb, Cr, Cg;
+ for(j=0; j<lumFilterSize; j++)
+ {
+ Y1 += lumSrc[j][2*i] * lumFilter[j];
+ Y2 += lumSrc[j][2*i+1] * lumFilter[j];
+ }
+ for(j=0; j<chrFilterSize; j++)
+ {
+ U += chrSrc[j][i] * chrFilter[j];
+ V += chrSrc[j][i+2048] * chrFilter[j];
+ }
+ Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
+ Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
+ U >>= 19;
+ V >>= 19;
+
+ Cb= clip_yuvtab_40cf[U+ 256];
+ Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
+ Cr= clip_yuvtab_3343[V+ 256];
+ // each pixel is one uint16_t built by OR-ing the three per-component table entries (clip_table16b/g/r)
+ ((uint16_t*)dest)[2*i] =
+ clip_table16b[(Y1 + Cb) >>13] |
+ clip_table16g[(Y1 + Cg) >>13] |
+ clip_table16r[(Y1 + Cr) >>13];
+
+ ((uint16_t*)dest)[2*i+1] =
+ clip_table16b[(Y2 + Cb) >>13] |
+ clip_table16g[(Y2 + Cg) >>13] |
+ clip_table16r[(Y2 + Cr) >>13];
+ }
+ }
+ else if(dstbpp==15)
+ {
+ int i;
+ for(i=0; i<(dstW>>1); i++){ // identical to the 16 bit branch except that the clip_table15* tables are used
+ int j;
+ int Y1=0;
+ int Y2=0;
+ int U=0;
+ int V=0;
+ int Cb, Cr, Cg;
+ for(j=0; j<lumFilterSize; j++)
+ {
+ Y1 += lumSrc[j][2*i] * lumFilter[j];
+ Y2 += lumSrc[j][2*i+1] * lumFilter[j];
+ }
+ for(j=0; j<chrFilterSize; j++)
+ {
+ U += chrSrc[j][i] * chrFilter[j];
+ V += chrSrc[j][i+2048] * chrFilter[j];
+ }
+ Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
+ Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
+ U >>= 19;
+ V >>= 19;
+
+ Cb= clip_yuvtab_40cf[U+ 256];
+ Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
+ Cr= clip_yuvtab_3343[V+ 256];
+
+ ((uint16_t*)dest)[2*i] =
+ clip_table15b[(Y1 + Cb) >>13] |
+ clip_table15g[(Y1 + Cg) >>13] |
+ clip_table15r[(Y1 + Cr) >>13];
+
+ ((uint16_t*)dest)[2*i+1] =
+ clip_table15b[(Y2 + Cb) >>13] |
+ clip_table15g[(Y2 + Cg) >>13] |
+ clip_table15r[(Y2 + Cr) >>13];
+ }
+ }
+}
+
+
//Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
@@ -270,7 +468,6 @@
// *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
-// *** Designed to upscale, but may work for downscale too.
// switching the cpu type during a sliced drawing can have bad effects, like sig11
void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int srcSliceY ,
int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
Index: swscale_template.c
===================================================================
RCS file: /cvsroot/mplayer/main/postproc/swscale_template.c,v
retrieving revision 1.51
retrieving revision 1.52
diff -u -r1.51 -r1.52
--- swscale_template.c 6 Dec 2001 01:23:23 -0000 1.51
+++ swscale_template.c 6 Dec 2001 19:07:25 -0000 1.52
@@ -672,33 +672,9 @@
: "%eax", "%edx", "%esi"
);
#else
- //FIXME Optimize (just quickly writen not opti..)
- int i;
- for(i=0; i<dstW; i++)
- {
- int val=0;
- int j;
- for(j=0; j<lumFilterSize; j++)
- val += lumSrc[j][i] * lumFilter[j];
-
- dest[i]= MIN(MAX(val>>19, 0), 255);
- }
-
- if(uDest != NULL)
- for(i=0; i<(dstW>>1); i++)
- {
- int u=0;
- int v=0;
- int j;
- for(j=0; j<lumFilterSize; j++)
- {
- u += chrSrc[j][i] * chrFilter[j];
- v += chrSrc[j][i + 2048] * chrFilter[j];
- }
-
- uDest[i]= MIN(MAX(u>>19, 0), 255);
- vDest[i]= MIN(MAX(v>>19, 0), 255);
- }
+yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize,
+ dest, uDest, vDest, dstW);
#endif
}
@@ -836,163 +812,10 @@
);
}
#else
- if(dstbpp==32)
- {
- int i;
- for(i=0; i<(dstW>>1); i++){
- int j;
- int Y1=0;
- int Y2=0;
- int U=0;
- int V=0;
- int Cb, Cr, Cg;
- for(j=0; j<lumFilterSize; j++)
- {
- Y1 += lumSrc[j][2*i] * lumFilter[j];
- Y2 += lumSrc[j][2*i+1] * lumFilter[j];
- }
- for(j=0; j<chrFilterSize; j++)
- {
- U += chrSrc[j][i] * chrFilter[j];
- V += chrSrc[j][i+2048] * chrFilter[j];
- }
- Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
- Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
- U >>= 19;
- V >>= 19;
-
- Cb= clip_yuvtab_40cf[U+ 256];
- Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
- Cr= clip_yuvtab_3343[V+ 256];
-
- dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
- dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
- dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
-
- dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
- dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
- dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
- }
- }
- else if(dstbpp==24)
- {
- int i;
- for(i=0; i<(dstW>>1); i++){
- int j;
- int Y1=0;
- int Y2=0;
- int U=0;
- int V=0;
- int Cb, Cr, Cg;
- for(j=0; j<lumFilterSize; j++)
- {
- Y1 += lumSrc[j][2*i] * lumFilter[j];
- Y2 += lumSrc[j][2*i+1] * lumFilter[j];
- }
- for(j=0; j<chrFilterSize; j++)
- {
- U += chrSrc[j][i] * chrFilter[j];
- V += chrSrc[j][i+2048] * chrFilter[j];
- }
- Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
- Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
- U >>= 19;
- V >>= 19;
-
- Cb= clip_yuvtab_40cf[U+ 256];
- Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
- Cr= clip_yuvtab_3343[V+ 256];
-
- dest[0]=clip_table[((Y1 + Cb) >>13)];
- dest[1]=clip_table[((Y1 + Cg) >>13)];
- dest[2]=clip_table[((Y1 + Cr) >>13)];
-
- dest[3]=clip_table[((Y2 + Cb) >>13)];
- dest[4]=clip_table[((Y2 + Cg) >>13)];
- dest[5]=clip_table[((Y2 + Cr) >>13)];
- dest+=6;
- }
- }
- else if(dstbpp==16)
- {
- int i;
- for(i=0; i<(dstW>>1); i++){
- int j;
- int Y1=0;
- int Y2=0;
- int U=0;
- int V=0;
- int Cb, Cr, Cg;
- for(j=0; j<lumFilterSize; j++)
- {
- Y1 += lumSrc[j][2*i] * lumFilter[j];
- Y2 += lumSrc[j][2*i+1] * lumFilter[j];
- }
- for(j=0; j<chrFilterSize; j++)
- {
- U += chrSrc[j][i] * chrFilter[j];
- V += chrSrc[j][i+2048] * chrFilter[j];
- }
- Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
- Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
- U >>= 19;
- V >>= 19;
-
- Cb= clip_yuvtab_40cf[U+ 256];
- Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
- Cr= clip_yuvtab_3343[V+ 256];
-
- ((uint16_t*)dest)[2*i] =
- clip_table16b[(Y1 + Cb) >>13] |
- clip_table16g[(Y1 + Cg) >>13] |
- clip_table16r[(Y1 + Cr) >>13];
-
- ((uint16_t*)dest)[2*i+1] =
- clip_table16b[(Y2 + Cb) >>13] |
- clip_table16g[(Y2 + Cg) >>13] |
- clip_table16r[(Y2 + Cr) >>13];
- }
- }
- else if(dstbpp==15)
- {
- int i;
- for(i=0; i<(dstW>>1); i++){
- int j;
- int Y1=0;
- int Y2=0;
- int U=0;
- int V=0;
- int Cb, Cr, Cg;
- for(j=0; j<lumFilterSize; j++)
- {
- Y1 += lumSrc[j][2*i] * lumFilter[j];
- Y2 += lumSrc[j][2*i+1] * lumFilter[j];
- }
- for(j=0; j<chrFilterSize; j++)
- {
- U += chrSrc[j][i] * chrFilter[j];
- V += chrSrc[j][i+2048] * chrFilter[j];
- }
- Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
- Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
- U >>= 19;
- V >>= 19;
-
- Cb= clip_yuvtab_40cf[U+ 256];
- Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
- Cr= clip_yuvtab_3343[V+ 256];
-
- ((uint16_t*)dest)[2*i] =
- clip_table15b[(Y1 + Cb) >>13] |
- clip_table15g[(Y1 + Cg) >>13] |
- clip_table15r[(Y1 + Cr) >>13];
+yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize,
+ dest, dstW, dstbpp);
- ((uint16_t*)dest)[2*i+1] =
- clip_table15b[(Y2 + Cb) >>13] |
- clip_table15g[(Y2 + Cg) >>13] |
- clip_table15r[(Y2 + Cr) >>13];
- }
- }
#endif
} //!FULL_UV_IPOL
}
@@ -1373,7 +1196,6 @@
uint8_t *dest, int dstW, int uvalpha, int dstbpp)
{
int uvalpha1=uvalpha^4095;
- const int yalpha=0;
const int yalpha1=0;
if(fullUVIpol || allwaysIpol)
@@ -1636,7 +1458,7 @@
"movd %%mm0, (%4, %%ebp) \n\t"
"addl $4, %%ebp \n\t"
" jnc 1b \n\t"
-
+
"popl %%ebp \n\t"
: "+a" (counter)
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
@@ -1764,7 +1586,12 @@
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
{
+#ifdef HAVE_MMX
+ // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
+ if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
+#else
if(sws_flags != SWS_FAST_BILINEAR)
+#endif
{
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
}
@@ -1885,7 +1712,12 @@
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
uint8_t *src1, uint8_t *src2, int srcW, int xInc)
{
+#ifdef HAVE_MMX
+ // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
+ if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
+#else
if(sws_flags != SWS_FAST_BILINEAR)
+#endif
{
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
@@ -2026,12 +1858,13 @@
}
}
-static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc,
+static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
int srcW, int dstW, int filterAlign, int one)
{
int i;
+ double filter[8000];
#ifdef HAVE_MMX
- asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
+ asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
#endif
if(ABS(xInc - 0x10000) <10) // unscaled
@@ -2066,14 +1899,13 @@
if(sws_flags == SWS_BICUBIC)
{
double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
-// int coeff;
- int y1,y2,y3,y4;
+ double y1,y2,y3,y4;
double A= -0.75;
// Equation is from VirtualDub
- y1 = (int)floor(0.5 + ( + A*d - 2.0*A*d*d + A*d*d*d) * 16384.0);
- y2 = (int)floor(0.5 + (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0);
- y3 = (int)floor(0.5 + ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0);
- y4 = (int)floor(0.5 + ( + A*d*d - A*d*d*d) * 16384.0);
+ y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
+ y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
+ y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
+ y4 = ( + A*d*d - A*d*d*d);
// printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
filter[i*(*filterSize) + 0]= y1;
@@ -2087,8 +1919,7 @@
for(j=0; j<*filterSize; j++)
{
double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
- int coeff;
- coeff= (int)(0.5 + (1.0 - d)*(1<<14));
+ double coeff= 1.0 - d;
if(coeff<0) coeff=0;
// printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
filter[i*(*filterSize) + j]= coeff;
@@ -2116,24 +1947,22 @@
for(j=0; j<*filterSize; j++)
{
double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
- int coeff;
+ double coeff;
if(sws_flags == SWS_BICUBIC)
{
double A= -0.75;
// d*=2;
// Equation is from VirtualDub
if(d<1.0)
- coeff = (int)floor(0.5 + (1.0 - (A+3.0)*d*d
- + (A+2.0)*d*d*d) * (1<<14));
+ coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
else if(d<2.0)
- coeff = (int)floor(0.5 + (-4.0*A + 8.0*A*d
- - 5.0*A*d*d + A*d*d*d) * (1<<14));
+ coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
else
- coeff=0;
+ coeff=0.0;
}
else
{
- coeff= (int)(0.5 + (1.0 - d)*(1<<14));
+ coeff= 1.0 - d;
if(coeff<0) coeff=0;
}
// if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
@@ -2160,17 +1989,17 @@
filterPos[i]= 0;
}
- if(filterPos[i] + *filterSize > srcW)
+ if(filterPos[i] + (*filterSize) > srcW)
{
- int shift= filterPos[i] + *filterSize - srcW;
+ int shift= filterPos[i] + (*filterSize) - srcW;
// Move filter coeffs right to compensate for filterPos
- for(j=*filterSize-2; j>=0; j--)
+ for(j=(*filterSize)-2; j>=0; j--)
{
- int right= MIN(j + shift, *filterSize-1);
+ int right= MIN(j + shift, (*filterSize)-1);
filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
filter[i*(*filterSize) +j]=0;
}
- filterPos[i]= srcW - *filterSize;
+ filterPos[i]= srcW - (*filterSize);
}
}
@@ -2190,7 +2019,7 @@
scale/= sum;
for(j=0; j<*filterSize; j++)
{
- filter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
+ dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
}
}
}
@@ -2339,17 +2168,29 @@
static int firstTime=1;
-int widthAlign= dstbpp==12 ? 16 : 8;
-if(((dstW + widthAlign-1)&(~(widthAlign-1))) > dststride)
+const int widthAlign= dstbpp==12 ? 16 : 8;
+const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
+const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride
+ : (((dstW+7)&(~7)))*bytespp - dststride;
+if(dststride%widthAlign !=0 )
{
- dstW&= ~(widthAlign-1);
if(firstTime)
fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
- "SwScaler: ->lowering width to compensate, new width=%d\n"
- "SwScaler: ->cannot do aligned memory acesses anymore\n",
- widthAlign, dstW);
+ "SwScaler: ->cannot do aligned memory acesses anymore\n",
+ widthAlign);
}
+if(over>0)
+{
+ if(firstTime)
+ fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
+ "SwScaler: and dststride is not large enough to handle %d extra bytes\n"
+ "SwScaler: ->using unoptimized C version for last line(s)\n",
+ over);
+}
+
+
+
//printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
//printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);
@@ -2357,9 +2198,11 @@
canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
{
- if(firstTime) //FIXME only if verbose ?
+ if(firstTime)
fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
}
+#else
+canMMX2BeUsed=0; // should be 0 anyway but ...
#endif
if(firstTime)
@@ -2398,7 +2241,7 @@
#elif defined (HAVE_MMX)
fprintf(stderr, "using MMX\n");
#elif defined (ARCH_X86)
- fprintf(stderr, "using X86 ASM2\n");
+ fprintf(stderr, "using X86 ASM\n");
#else
fprintf(stderr, "using C\n");
#endif
@@ -2413,13 +2256,15 @@
if(sws_flags==SWS_FAST_BILINEAR)
{
if(canMMX2BeUsed) lumXInc+= 20;
+#ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available
else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
+#endif
}
if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW;
-else chrXInc= lumXInc, chrDstW= dstW>>1;
+else chrXInc= lumXInc, chrDstW= (dstW+1)>>1;
-if(dstbpp==12) chrYInc= lumYInc, chrDstH= dstH>>1;
+if(dstbpp==12) chrYInc= lumYInc, chrDstH= (dstH+1)>>1;
else chrYInc= lumYInc>>1, chrDstH= dstH;
// force calculation of the horizontal interpolation of the first line
@@ -2440,13 +2285,10 @@
#endif
oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;
- if(sws_flags != SWS_FAST_BILINEAR)
- {
- RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
- srcW , dstW , filterAlign, 1<<14);
- RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
- srcW>>1, chrDstW, filterAlign, 1<<14);
- }
+ RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
+ srcW , dstW , filterAlign, 1<<14);
+ RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
+ (srcW+1)>>1, chrDstW, filterAlign, 1<<14);
#ifdef HAVE_MMX2
// cant downscale !!!
@@ -2470,7 +2312,7 @@
RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
srcH , dstH, 1, (1<<12)-4);
RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
- srcH>>1, chrDstH, 1, (1<<12)-4);
+ (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
// Calculate Buffer Sizes so that they wont run out while handling these damn slices
vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
@@ -2509,6 +2351,74 @@
#endif
}
+ if(firstTime && verbose)
+ {
+#ifdef HAVE_MMX2
+ int mmx2=1;
+#else
+ int mmx2=0;
+#endif
+#ifdef HAVE_MMX
+ int mmx=1;
+#else
+ int mmx=0;
+#endif
+
+#ifdef HAVE_MMX
+ if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
+ printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
+ else
+ {
+ if(hLumFilterSize==4)
+ printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
+ else if(hLumFilterSize==8)
+ printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
+ else
+ printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
+
+ if(hChrFilterSize==4)
+ printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
+ else if(hChrFilterSize==8)
+ printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
+ else
+ printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
+ }
+#elif defined (ARCH_X86)
+ printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
+#else
+ if(sws_flags==SWS_FAST_BILINEAR)
+ printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
+ else
+ printf("SwScaler: using C scaler for horizontal scaling\n");
+#endif
+
+ if(dstbpp==12)
+ {
+ if(vLumFilterSize==1)
+ printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
+ else
+ printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
+ }
+ else
+ {
+ if(vLumFilterSize==1 && vChrFilterSize==2)
+ printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
+ "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
+ else if(vLumFilterSize==2 && vChrFilterSize==2)
+ printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
+ else
+ printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
+ }
+
+ if(dstbpp==24)
+ printf("SwScaler: using %s YV12->BGR24 Converter\n",
+ mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
+ else
+ printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);
+
+ printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
+ }
+
lastInLumBuf= -1;
lastInChrBuf= -1;
} // if(firstLine)
@@ -2557,7 +2467,7 @@
ASSERT(chrBufIndex < 2*vChrBufSize)
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
- RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc);
+ RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
@@ -2590,7 +2500,7 @@
ASSERT(chrBufIndex < 2*vChrBufSize)
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
- RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc);
+ RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
@@ -2605,7 +2515,8 @@
g5Dither= dither8[dstY&1];
r5Dither= dither8[(dstY+1)&1];
#endif
-
+ if(dstY < dstH-2 || over<=0)
+ {
if(dstbpp==12) //YV12
{
if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
@@ -2657,6 +2568,29 @@
lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
}
}
+ }
+ else // hmm looks like we cant use MMX here without overwriting this arrays tail
+ {
+ int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+ int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+ if(dstbpp==12) //YV12
+ {
+ if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
+ yuv2yuvXinC(
+ vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
+ vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+ dest, uDest, vDest, dstW);
+ }
+ else
+ {
+ ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
+ ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
+ yuv2rgbXinC(
+ vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+ vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+ dest, dstW, dstbpp);
+ }
+ }
}
#ifdef HAVE_MMX
More information about the MPlayer-cvslog
mailing list