[Mplayer-cvslog] CVS: main/libmp1e/video dct.c,NONE,1.1 dct.h,NONE,1.1 dct_ieee.h,NONE,1.1 dct_mmx.s,NONE,1.1 dct_ref.c,NONE,1.1 filter.c,NONE,1.1 filter_mmx.s,NONE,1.1 libvideo.h,NONE,1.1 mblock.c,NONE,1.1 mblock.h,NONE,1.1 motion.c,NONE,1.1 motion.h,NONE,1.1 motion_mmx.s,NONE,1.1 motion_sse2.s,NONE,1.1 mpeg.h,NONE,1.1 mpeg1.c,NONE,1.1 tables.c,NONE,1.1 video.h,NONE,1.1 vlc.c,NONE,1.1 vlc.h,NONE,1.1 vlc_mmx.s,NONE,1.1

Wed Dec 5 00:58:13 CET 2001

Update of /cvsroot/mplayer/main/libmp1e/video
In directory mplayer:/var/tmp.root/cvs-serv18843

Added Files:
	dct.c dct.h dct_ieee.h dct_mmx.s dct_ref.c filter.c 
	filter_mmx.s libvideo.h mblock.c mblock.h motion.c motion.h 
	motion_mmx.s motion_sse2.s mpeg.h mpeg1.c tables.c video.h 
	vlc.c vlc.h vlc_mmx.s 
Log Message:

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: dct.c,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#include <assert.h>
#include "../common/math.h"
#include "../common/mmx.h"
#include "dct.h"
#include "dct_ieee.h"
#include "mpeg.h"
#include "video.h"

// static char sh1[8] = { 15, 14, 13, 13, 12, 12, 12, 11 };
// static char sh2[8] = { 16, 14, 13, 13, 13, 12, 12, 12 };

/*
 *  ((q > 16) ? q & ~1 : q) == ((ltp[q] * 2 + 1) << lts[q])
 */
/*
 *  ltp[q] and lts[q] split a value q (table index 0 ... 31) into an
 *  odd factor (ltp[q] * 2 + 1) and a left shift lts[q].  For q <= 16
 *  the product is q itself, for q > 16 it is q with bit 0 cleared.
 *  Entry 0 is the exception: (0 * 2 + 1) << 0 gives 1, not 0.
 */
char ltp[32] __attribute__ ((aligned (MIN(32, CACHE_LINE)))) = {
    0, 0, 0, 1, 0, 2, 1, 3, 0, 4, 2, 5, 1, 6, 3, 7,
    0, 0, 4, 4, 2, 2, 5, 5, 1, 1, 6, 6, 3, 3, 7, 7,
};
/* Shift counts belonging to ltp[] above, same indexing. */
char lts[32] __attribute__ ((aligned (MIN(32, CACHE_LINE)))) = { 
    0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    4, 4, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 2, 2, 1, 1,
};

/*
 *  align(n) attaches a section and an alignment attribute to the
 *  variables declared below, or expands to nothing.
 *
 *  NOTE(review): the test looks at __GNUC_MINOR__ only, so every
 *  compiler release whose minor number is below 90 (whatever its major
 *  number) takes the empty definition and the variables below lose
 *  their alignment request -- confirm whether this is intended.
 */
#if __GNUC_MINOR__ < 90
#define align(n)
#else
#define align(n) __attribute__ ((SECTION("video_tables") aligned (n)))
#endif

/*
 *  Packed constants; the ones assigned in init_dct() below are
 *  c0 ... c256, c2q and c1x.
 */
mmx_t c0 align(8);
mmx_t c1 align(8);
mmx_t c2 align(8);
mmx_t c4 align(8);
mmx_t c1_15 align(8);
mmx_t c1_16 align(8);
mmx_t c1_17 align(8);
mmx_t c1_15w align(8);
mmx_t c128 align(8);
mmx_t c255 align(8);
mmx_t c128_6 align(8);
mmx_t c1b align(8);
mmx_t cC4_14 align(8);
mmx_t cC4_15 align(8);
mmx_t c1C6_13 align(8);
mmx_t cC4C6_14 align(8);
mmx_t cC2C61_13 align(8);
mmx_t cC2626_15 align(8);
mmx_t cC6262_15 align(8);
mmx_t c256 align(8);
mmx_t mm8, mm9;
mmx_t c2q align(8);

/* Lookup tables, all filled in by init_dct() below. */
char	mmx_q_fdct_intra_sh[32]		align(MIN(CACHE_LINE,32));
short	mmx_q_fdct_intra_q_lut[8][8][8]	align(CACHE_LINE);
short	mmx_q_fdct_inter_lut[6][8][2]	align(CACHE_LINE);
short	mmx_q_fdct_inter_lut0[8][1]	align(CACHE_LINE);
short	mmx_q_fdct_inter_q[32]		align(CACHE_LINE);
mmx_t	mmx_q_idct_inter_tab[16]	align(CACHE_LINE);
short	mmx_q_idct_intra_q_lut[8][8][8]	align(CACHE_LINE);

/* Scratch variables; presumably written by the assembly routines -- TODO confirm */
mmx_t cfae;	// fdct_inter temp
mmx_t csh;	// fdct_inter temp
mmx_t crnd;	// fdct_inter temp
mmx_t c1x;	// idct_intra2 temp, set at the end of init_dct()
mmx_t shift, mask, mask0; // idct_intra2 temp

#define R2 sqrt(2.0)

static void init_dct(void) __attribute__ ((constructor));

/*
 *  Fills in the packed constants and the quantisation / DCT lookup
 *  tables defined above.  Declared with the constructor attribute, so
 *  it takes no arguments and returns nothing.
 */
static void
init_dct(void)
{
	double mmx_inter_lut[8][8];
	int q, v, u, sh, max;
        double Cu, Cv, m;
	int shq[8];

	/* Constants used throughout the video encoder */

	c0		= MMXRW(0);
	c1		= MMXRW(1);
	c2		= MMXRW(2);
	c4		= MMXRW(4);

	c1_15		= MMXRD(1 << 15);
	c1_16		= MMXRD(1 << 16);
	c1_17		= MMXRD(1 << 17);
	c1_15w		= MMXRW(0x8000);

	c128		= MMXRW(128);
	c255		= MMXRW(255);
	c128_6		= MMXRW(128 << 6);
	c1b		= MMXRB(1);
	c256		= MMXRW(1 << 8);

	/* Cosine constants scaled by a power of two (S13 ... S15) */

	cC4_14		= MMXRW(lroundn(C4 * S14));
	cC4_15		= MMXRW(lroundn(C4 * S15));

	c1C6_13		= MMXRW(lroundn((1.0 / C6) * S13));
	cC4C6_14	= MMXRW(lroundn((C4 / C6) * S14));
	cC2C61_13	= MMXRW(lroundn((C2 / C6 - 1) * S13));
	cC2626_15	= MMXW(lroundn(C2 * S15), -lroundn(C6 * S15), lroundn(C2 * S15), -lroundn(C6 * S15));
	cC6262_15	= MMXW(lroundn(C6 * S15), +lroundn(C2 * S15), lroundn(C6 * S15), +lroundn(C2 * S15));

//	c3a = MMXRW(0);
//	c5a = MMXRW(128 * 32 + 16);
//	c5b = MMXRW(16);

	c2q.uq = 2ULL;

	/* Inverse DCT (inter) constants, cosines divided by sqrt(2) */

	mmx_q_idct_inter_tab[0]	 = MMXW(+lroundn(S15*C1/R2), +lroundn(S15*C7/R2), +lroundn(S15*C1/R2), +lroundn(S15*C7/R2));
	mmx_q_idct_inter_tab[1]	 = MMXW(+lroundn(S15*C7/R2), -lroundn(S15*C1/R2), +lroundn(S15*C7/R2), -lroundn(S15*C1/R2));
	mmx_q_idct_inter_tab[2]	 = MMXW(+lroundn(S15*C3/R2), +lroundn(S15*C5/R2), +lroundn(S15*C3/R2), +lroundn(S15*C5/R2));
	mmx_q_idct_inter_tab[3]	 = MMXW(-lroundn(S15*C5/R2), +lroundn(S15*C3/R2), -lroundn(S15*C5/R2), +lroundn(S15*C3/R2));
	mmx_q_idct_inter_tab[4]	 = MMXRW(lroundn(S15*C1/R2));
	mmx_q_idct_inter_tab[5]	 = MMXRW(lroundn(S15*(C7+C1)/R2));
	mmx_q_idct_inter_tab[6]	 = MMXRW(lroundn(S15*(C7-C1)/R2));
	mmx_q_idct_inter_tab[7]	 = MMXRW(lroundn(S15*C5/R2));
	mmx_q_idct_inter_tab[8]	 = MMXRW(lroundn(S15*(C3+C5)/R2));
	mmx_q_idct_inter_tab[9]	 = MMXRW(lroundn(S15*(C3-C5)/R2));
	mmx_q_idct_inter_tab[10] = MMXRD(1024);
	mmx_q_idct_inter_tab[11] = MMXRW((8 << 2) + 2);
	mmx_q_idct_inter_tab[12] = MMXRW(lroundn(S15*C2/R2));
	mmx_q_idct_inter_tab[13] = MMXRW(lroundn(S15*(C6+C2)/R2));
	mmx_q_idct_inter_tab[14] = MMXRW(lroundn(S15*(C6-C2)/R2));
	mmx_q_idct_inter_tab[15] = MMXRW(lroundn(S16*(C6-C2)/R2));

	/*
	 *  Forward DCT + intra quantisation multipliers for the odd
	 *  divisors 2 * q + 1.  The table is recomputed with a growing
	 *  scale 1 << sh until the largest entry seen reaches 16384.
	 *  sh is incremented before the exit test, so shq[q] ends up one
	 *  above the shift used for the table that is kept.
	 *
	 *  NOTE(review): the entry is stored at [q][u][v] but the maximum
	 *  is taken from [q][v][u], which for u > v still holds the value
	 *  of the previous pass -- confirm this is intended.
	 */
	for (q = 0; q < 8; q++) {
		for (sh = max = 0; max < 16384; sh++)
			for (v = max = 0; v < 8; v++)
				for (u = 0; u < 8; u++) {
					Cu = (u == 0) ? 1 : (cos(u * M_PI / 16.0) * sqrt(2.0));
					Cv = (v == 0) ? 1 : (cos(v * M_PI / 16.0) * sqrt(2.0));

					m = 1.0 / (Cu * Cv * 8.0);

					if (u == 0 || u == 4) m *= 0.125;
					if (u == 2 || u == 6) m *= 0.25;
					if (u & 1) m *= C6 * 0.5;
					if (v == 0 || v == 4) m *= 2;
					if (v == 2 || v == 6) m *= 4;
					if (v & 1) m *= C6 * 8;
					if (u == 0 && v == 0) m = 0;

					mmx_q_fdct_intra_q_lut[q][u][v] = lroundn(
						m
						* 8
						/ default_intra_quant_matrix[v][u]
						/ (2 * q + 1)
						* (double)(1 << sh));

					if (mmx_q_fdct_intra_q_lut[q][v][u] > max)
						max = mmx_q_fdct_intra_q_lut[q][v][u];

					mmx_q_fdct_intra_q_lut[q][0][0] = 0;
				}
		shq[q] = sh;
	}

	/* Per-scale shift: table shift plus the power-of-two part of q */
	for (q = 1; q < 32; q++) {
		int ltsi = lts[q], ltpi = ltp[q];

		mmx_q_fdct_intra_sh[q] = shq[ltpi] + ltsi - 17;
	}

	/* Post-scaling factors for the inter forward DCT */
	for (v = 0; v < 8; v++) {
		for (u = 0; u < 8; u++) {
    			Cu = (u == 0) ? 1.0 : (cos(u * M_PI / 16.0) * sqrt(2.0));
			Cv = (v == 0) ? 1.0 : (cos(v * M_PI / 16.0) * sqrt(2.0));

			if (v == 2 || v == 6) Cv = 1.0;

			m = 1.0 / (Cu * Cv * 8.0);

			if (u & 1) m *= C6;
			if (u == 0 || u == 4 || u == 7) m /= 4.0;
			if (u == 2 || u == 5) m /= 2.0;
			if (u == 6) m /= 8.0;

			mmx_inter_lut[v][u] = m;
		}
	}

	for (u = 0; u < 8; u++) {
		mmx_q_fdct_inter_lut0[u][0] = lroundn(mmx_inter_lut[0][u] * S19);
		mmx_q_fdct_inter_lut[1][u][0] = lroundn(mmx_inter_lut[0][u] * +(C2 + C6) * S18);
		mmx_q_fdct_inter_lut[1][u][1] = lroundn(mmx_inter_lut[0][u] * +(C2 - C6) * S18);
		mmx_q_fdct_inter_lut[4][u][0] = lroundn(mmx_inter_lut[0][u] * +(C2 - C6) * S18);
		mmx_q_fdct_inter_lut[4][u][1] = lroundn(mmx_inter_lut[0][u] * -(C2 + C6) * S18);
		mmx_q_fdct_inter_lut[0][u][1] = +(mmx_q_fdct_inter_lut[0][u][0] = lroundn(mmx_inter_lut[1][u] * S19));
		mmx_q_fdct_inter_lut[2][u][1] = -(mmx_q_fdct_inter_lut[2][u][0] = lroundn(mmx_inter_lut[3][u] * S19));
		mmx_q_fdct_inter_lut[3][u][1] = +(mmx_q_fdct_inter_lut[3][u][0] = lroundn(mmx_inter_lut[5][u] * S19));
		mmx_q_fdct_inter_lut[5][u][1] = -(mmx_q_fdct_inter_lut[5][u][0] = lroundn(mmx_inter_lut[7][u] * S17));
	}

	/*
	 *  Reciprocal table S15 / (2 * q).  Index 0 would need S15 / 0,
	 *  an infinite value that cannot be converted to an integer, so
	 *  that entry is stored as 0 explicitly and the loop starts at 1
	 *  like the per-scale shift loop above.
	 */
	mmx_q_fdct_inter_q[0] = 0;

	for (q = 1; q < 32; q++)
		mmx_q_fdct_inter_q[q] = lroundn(S15 / q / 2.0);

	/* Intra dequantisation multipliers, DC entry forced to 0 */
	for (q = 0; q < 8; q++) {
		for (v = 0; v < 8; v++)
			for (u = 0; u < 8; u++)
				if (u + v == 0)
					mmx_q_idct_intra_q_lut[q][v][u] = 0;
				else
					mmx_q_idct_intra_q_lut[q][v][u] = 
						4 * default_intra_quant_matrix[v][u] * (q * 2 + 1);

//		dump(mmx_q_idct_intra_q_lut[q]);
	}

	c1x = MMXRW(((8 + 128 * 16) << 2) + 2);
}

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: dct.h,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#ifndef MP1E_VIDEO_DCT_H
#define MP1E_VIDEO_DCT_H

/*
 *  reg(n) marks a function with the GCC regparm(n) attribute; it is
 *  written after the parameter list of each prototype below.
 */
#define reg(n) __attribute__ ((regparm (n)))

/*
 *  Forward DCT + quantisation and MPEG-1 dequantisation + inverse DCT.
 *  The *_inter forward transforms return an unsigned int (presumably
 *  the coded block pattern later passed as cbp -- TODO confirm).
 */
extern void		mp1e_fdct_intra(int quant_scale) reg(1);
extern unsigned int	mp1e_fdct_inter(short iblock[6][8][8],
					     int quant_scale) reg(2);
extern void		mp1e_mpeg1_idct_intra(int quant_scale) reg(1);
extern void		mp1e_mpeg1_idct_inter(int quant_scale,
					      unsigned int cbp) reg(2);

/* Same interface with an mmx_ infix (presumably the MMX versions) */
extern void		mp1e_mmx_fdct_intra(int quant_scale) reg(1);
extern unsigned int	mp1e_mmx_fdct_inter(short iblock[6][8][8],
						 int quant_scale) reg(2);
extern void		mp1e_mmx_mpeg1_idct_intra(int quant_scale) reg(1);
extern void		mp1e_mmx_mpeg1_idct_intra2(int quant_scale) reg(1);
extern void		mp1e_mmx_mpeg1_idct_inter(int quant_scale,
						  unsigned int cbp) reg(2);

extern void		mp1e_mmx_copy_refblock(void);

#endif /* MP1E_VIDEO_DCT_H */

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  Based on code written by Tom G. Lane
 *  and released to public domain 11/22/93.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: dct_ieee.h,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>

#define IEEE_PI 3.14159265358979323846

/* cos(k * pi / 16) for k = 0 ... 7, evaluated in double precision */
#define IEEE_COS16(k) cos((k) * IEEE_PI / 16.0)

#define C0 IEEE_COS16(0.0) // 1.0
#define C1 IEEE_COS16(1.0) // 0.9808
#define C2 IEEE_COS16(2.0) // 0.9239
#define C3 IEEE_COS16(3.0) // 0.8315
#define C4 IEEE_COS16(4.0) // 0.7071
#define C5 IEEE_COS16(5.0) // 0.5556
#define C6 IEEE_COS16(6.0) // 0.3827
#define C7 IEEE_COS16(7.0) // 0.1951

/* Sn is 2 to the power of n as a double, used as fixed point scale */
#define IEEE_POW2(n) ((double)(1 << (n)))

#define S13 IEEE_POW2(13)
#define S14 IEEE_POW2(14)
#define S15 IEEE_POW2(15)
#define S16 IEEE_POW2(16)
#define S17 IEEE_POW2(17)
#define S18 IEEE_POW2(18)
#define S19 IEEE_POW2(19)

/*

typedef void dct_func(short [8][8]);
typedef void qdct_func(int q, short [8][8]);

extern int ieee_round(double val);
extern void ieee_ref_fdct(short block[8][8]);
extern void ieee_ref_idct(short block[8][8]);
extern void mpeg_intra_quant(int q, short block[8][8]);
extern void mpeg_inter_quant(int q, short block[8][8]);
extern void mpeg1_intra_iquant(int q, short block[8][8]);
extern void mpeg1_inter_iquant(int q, short block[8][8]);
extern void mpeg2_intra_iquant(int q, short block[8][8]);
extern void mpeg2_inter_iquant(int q, short block[8][8]);
extern void ieee_randomize(short block[8][8], long minpix, long maxpix, long sign);
extern void rake_pattern(short block[8][8], long minpix, long maxpix, long sign);
extern void (* randomize)(short [8][8], long, long, long);
extern void q_fdct_test(qdct_func *fdct, qdct_func *quant, long minpix, long maxpix, long sign, int iterations, unsigned int quant_mask);
extern void q_idct_test(qdct_func *idct, qdct_func *quant, qdct_func *iquant, long minpix, long maxpix, long sign, int iterations, unsigned int quant_mask);
extern void ieee_idct_test(char *name, dct_func *idct, long minpix, long maxpix, long sign, int iterations);
extern void fdct_test(char *name, dct_func *fdct, long minpix, long maxpix, long sign, int iterations);
extern void ieee_1180(char *name, dct_func *idct);

*/

/*
 *  Helpers for two-dimensional arrays ("blocks").  All of them except
 *  mirror() walk the block as one flat run of elements through
 *  (block)[0][i].
 */

/* Total number of elements in a two-dimensional array */
#define __elements(block) (sizeof(block) / sizeof((block)[0][0]))

/* Transpose an 8 x 8 block in place; needs a swap() macro in scope */
#define mirror(block)						\
do {								\
	int _i, _j;						\
	for (_i = 0; _i < 7; _i++)				\
		for (_j = _i + 1; _j < 8; _j++)			\
			swap((block)[_i][_j], (block)[_j][_i]);	\
} while (0)

/* Add n to every element (n is evaluated once per element) */
#define trans(block, n)						\
do {								\
	int _i;							\
	for (_i = 0; _i < __elements(block); _i++)		\
		(block)[0][_i] += (n);				\
} while (0)

/* Copy __elements(d) elements from s to d, converting per element */
#define copy(d, s)						\
do {								\
	int _i;							\
	for (_i = 0; _i < __elements(d); _i++)			\
		(d)[0][_i] = (s)[0][_i];			\
} while (0)

/* Set every element to zero */
#define clear(block)						\
do {								\
	int _i;							\
	for (_i = 0; _i < __elements(block); _i++)		\
		(block)[0][_i] = 0.0;				\
} while (0)

/* Print the block to stderr, one array row per output line */
#define dump(block)						\
do {								\
	int _i;							\
	int _j = sizeof((block)[0]) / sizeof((block)[0][0]);	\
	fprintf(stderr, #block ":\n");				\
	for (_i = 0; _i < __elements(block); _i++)		\
		fprintf(stderr, "%11.4f%c",			\
			(double)(block)[0][_i],			\
			(_i % _j == _j - 1) ? '\n' : ' ');	\
	fprintf(stderr, "\n");					\
} while (0)

/*
 *  Print the smallest and largest element to stderr.  Minimum and
 *  maximum are tested independently: with an "else if" an element
 *  that lowers the minimum would never be considered as maximum, so
 *  a block starting with its largest value reported a wrong maximum.
 */
#define peak(block)						\
do {								\
	int _i;							\
	double _min = 1e30, _max = -1e30;			\
	for (_i = 0; _i < __elements(block); _i++) {		\
		if ((block)[0][_i] < _min)			\
			_min = (block)[0][_i];			\
		if ((block)[0][_i] > _max)			\
			_max = (block)[0][_i];			\
	}							\
	fprintf(stderr, #block ": %11.4f ... %11.4f\n",		\
		_min, _max);					\
} while (0)

/* res = element-wise max(|bl1|, |bl2|); needs a MAX() macro in scope */
#define maxabs(res, bl1, bl2)					\
do {								\
	int _i;							\
	for (_i = 0; _i < __elements(res); _i++)			\
		(res)[0][_i] = MAX(fabs((bl1)[0][_i]),		\
			fabs((bl2)[0][_i]));			\
} while (0)

--- NEW FILE ---
#
#  MPEG-1 Real Time Encoder
# 
#  Copyright (C) 1999-2001 Michael H. Schimek
# 
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License version 2 as
#  published by the Free Software Foundation.
# 
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
# 
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

[...2415 lines suppressed...]
	movq	5*8(%eax),%mm5;		movq		%mm2,(%edx,%esi);	// 1
	movq	6*8(%eax),%mm6;		packuswb 	%mm5,%mm4;
	movq	7*8(%eax),%mm7;		movq 		%mm4,(%edx,%esi,2);	// 2
	movq	8*8(%eax),%mm0;		packuswb	%mm7,%mm6;
	movq	9*8(%eax),%mm1;		movq		%mm6,(%edi,%esi,2);	// 3
	movq	10*8(%eax),%mm2;	packuswb 	%mm1,%mm0;
	movq	11*8(%eax),%mm3;	movq 		%mm0,(%edx,%esi,4);	// 4
	movq	12*8(%eax),%mm4;	leal		(%edi,%esi,4),%edi;
	movq	13*8(%eax),%mm5;	packuswb	%mm3,%mm2;
	movq	14*8(%eax),%mm6;	movq		%mm2,(%edi);		// 5
	movq	15*8(%eax),%mm7;	packuswb 	%mm5,%mm4;
	leal	128(%eax),%eax;		movq 		%mm4,(%edi,%esi);	// 6
	cmpl	$mb_address+6*8,%ebx;	packuswb	%mm7,%mm6;
	movq	%mm6,(%edi,%esi,2);	jne		1b;			// 7

	popl		%edi;			
	popl		%esi;
	popl		%ebx;			
	ret;

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: dct_ref.c,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#include "dct.h"
#include "mpeg.h"
#include "video.h"
#include "dct_ieee.h"
#include "../common/math.h"

#define FLOAT float

static FLOAT		aan_fwd_lut[8][8];
static FLOAT		aan_inv_lut[8][8];

static void aan_lut_init(void) __attribute__ ((constructor));

/*
 *  Builds the scaling tables applied after the forward and before the
 *  inverse 1-D passes: with c(0) = 1 and c(k) = cos(k * pi / 16) *
 *  sqrt(2), the forward entry is 1 / (c(u) * c(v) * 8) and the inverse
 *  entry is c(u) * c(v) / 8.
 */
static void
aan_lut_init(void)
{
	int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++) {
			double Cu = 1.0, Cv = 1.0;

			if (col != 0)
				Cu = cos(col * M_PI / 16.0) * sqrt(2.0);
			if (row != 0)
				Cv = cos(row * M_PI / 16.0) * sqrt(2.0);

			aan_fwd_lut[row][col] = 1.0 / (Cu * Cv * 8.0);
			aan_inv_lut[row][col] = 1.0 * (Cu * Cv / 8.0);
		}
	}
}

/*
 *  One-dimensional 8 point forward DCT pass.
 *
 *  in:  8 input samples
 *  out: 8 unnormalised coefficients; the callers in this file
 *       multiply the two-dimensional result by aan_fwd_lut.
 *
 *  Despite the name all temporaries are of type FLOAT, so every
 *  assignment rounds to that type.
 */
static void
aan_double_1d_fdct(FLOAT *in, FLOAT *out)
{
	FLOAT tmp0, tmp1, tmp2, tmp3;
	FLOAT tmp4, tmp5, tmp6, tmp7;

	/* even */

	/* sums of the mirrored sample pairs */
    	tmp0 = in[0] + in[7];
	tmp1 = in[1] + in[6];
	tmp2 = in[2] + in[5];
	tmp3 = in[3] + in[4];

	tmp4 = tmp0 + tmp3;
        tmp5 = tmp1 + tmp2;

	out[0] = tmp4 + tmp5;
        out[4] = tmp4 - tmp5;

	tmp0 -= tmp3;
	tmp1 -= tmp2;

	/* single multiplication of the even part */
	tmp1 = (tmp0 + tmp1) * C4;

	out[2] = tmp0 + tmp1;
	out[6] = tmp0 - tmp1;

	/* odd */

	/* differences of the mirrored sample pairs */
	tmp7 = in[0] - in[7];
	tmp6 = in[1] - in[6];
	tmp5 = in[2] - in[5];
	tmp4 = in[3] - in[4];

	tmp4 += tmp5;
	tmp5 += tmp6;
        tmp6 += tmp7;

	/* tmp3 is the term shared by the two rotated outputs */
	tmp3 = (tmp4 - tmp6) * C6;
	tmp4 = (tmp4 * (C2 - C6)) + tmp3;
	tmp6 = (tmp6 * (C2 + C6)) + tmp3;
	tmp5 = tmp5 * C4;

	tmp3 = tmp7 + tmp5;
	tmp7 = tmp7 - tmp5;

	out[5] = tmp7 + tmp4;
        out[3] = tmp7 - tmp4;
        out[1] = tmp3 + tmp6;
        out[7] = tmp3 - tmp6;
}

/*
 *  One-dimensional 8 point inverse DCT pass.
 *
 *  in:  8 coefficients; the callers in this file scale them by
 *       aan_inv_lut before the first pass
 *  out: 8 reconstructed samples
 *
 *  Despite the name all temporaries are of type FLOAT, so every
 *  assignment rounds to that type.
 */
static void
aan_double_1d_idct(FLOAT *in, FLOAT *out)
{
	FLOAT tmp0, tmp1, tmp2, tmp3;
	FLOAT tmp4, tmp5, tmp6, tmp7, tmp8;

	/* odd */

	tmp5 = in[5] + in[3];
	tmp6 = in[5] - in[3];
	tmp8 = in[1] + in[7];
	tmp4 = in[1] - in[7];

	tmp7 = tmp8 + tmp5;
	tmp8 = tmp8 - tmp5;

	/* tmp5 first holds the term shared by tmp6 and tmp4 ... */
	tmp5 = +2.0 * C2 * (tmp4 + tmp6);
	tmp6 = -2.0 * (C2 + C6) * tmp6 + tmp5;
	tmp4 = -2.0 * (C2 - C6) * tmp4 + tmp5;
	/* ... and is then reused for the scaled difference */
	tmp5 = +2.0 * C4 * tmp8;

	/* each result feeds the next subtraction, order matters */
	tmp6 -= tmp7;
	tmp5 -= tmp6;
	tmp4 -= tmp5;

	/* even */

	tmp2 = in[2] + in[6];
	tmp8 = in[2] - in[6];

	tmp8 = tmp8 * 2.0 * C4 - tmp2;

	tmp0 = in[0] + in[4];
	tmp1 = in[0] - in[4];

	tmp3 = tmp0 - tmp2;
	tmp0 = tmp0 + tmp2;
	tmp2 = tmp1 - tmp8;
	tmp1 = tmp1 + tmp8;

	/* final butterflies: even part plus / minus odd part */
	out[0] = tmp0 + tmp7;
	out[1] = tmp1 + tmp6;
	out[2] = tmp2 + tmp5;
	out[3] = tmp3 + tmp4;
	out[4] = tmp3 - tmp4;
	out[5] = tmp2 - tmp5;
	out[6] = tmp1 - tmp6;
	out[7] = tmp0 - tmp7;
}

// #define SATURATE(val, min, max) saturate((val), (min), (max))
#define SATURATE(val, min, max) (val)
/*
 *  Saturation in RL/VLC routines with overflow feedback, see there.
 */

/*
 *  Reference forward DCT and intra quantisation of the six blocks in
 *  mblock[0]; the quantised coefficients are stored in mblock[1].
 *
 *  quant_scale: quantiser scale, multiplied with the entries of
 *               default_intra_quant_matrix to form the divisor.
 */
void
fdct_intra(int quant_scale)
{
	int blk, n, line, coef, divisor;

	emms();

	for (blk = 0; blk < 6; blk++) {
		FLOAT F[8][8], t[8][8];

		/* Fetch the samples, shifted down by 128 */
		for (n = 0; n < 64; n++)
			F[0][n] = mblock[0][blk][0][n] - 128;

		/* Two 1-D passes with a transposition after each */
		for (line = 0; line < 8; line++)
			aan_double_1d_fdct(F[line], t[line]);

		mirror(t);

		for (line = 0; line < 8; line++)
			aan_double_1d_fdct(t[line], F[line]);

		mirror(F);

		/* First coefficient: fixed divisor 8, rounded away from zero */
		coef = lroundn(F[0][0] * aan_fwd_lut[0][0]);
		mblock[1][blk][0][0] = SATURATE((coef + 4 * sign(coef)) / 8, -255, +255);

		/* Remaining coefficients: divisor from matrix and scale */
		n = 1;
		while (n < 64) {
			coef = lroundn(F[0][n] * aan_fwd_lut[0][n]);
			divisor = default_intra_quant_matrix[0][n] * quant_scale;

			mblock[1][blk][0][n] = SATURATE((8 * coef + sign(coef) * (divisor >> 1)) / divisor, -255, +255);
			n++;
		}

		mirror(mblock[1][blk]);
	}
}

/*
 *  Reference forward DCT and inter quantisation.
 *
 *  iblock:      six 8 x 8 input blocks
 *  quant_scale: quantiser scale; each coefficient is divided by
 *               2 * quant_scale
 *
 *  The quantised coefficients are stored in mblock[0].  Returns a bit
 *  mask with bit (0x20 >> i) set for every block i that has at least
 *  one non-zero quantised coefficient.
 */
unsigned int
fdct_inter(short iblock[6][8][8], int quant_scale)
{
	int blk, n, coef, pattern = 0;

	emms();

	for (blk = 0; blk < 6; blk++) {
		FLOAT F[8][8], t[8][8];
		int coded = 0;

		for (n = 0; n < 64; n++)
			F[0][n] = iblock[blk][0][n];

		/* Two 1-D passes with a transposition after each */
		for (n = 0; n < 8; n++)
			aan_double_1d_fdct(F[n], t[n]);

		mirror(t);

		for (n = 0; n < 8; n++)
			aan_double_1d_fdct(t[n], F[n]);

		mirror(F);

		for (n = 0; n < 64; n++) {
			coef = lroundn(F[0][n] * aan_fwd_lut[0][n]);

			mblock[0][blk][0][n] = SATURATE(coef / (2 * quant_scale), -255, +255);

			if (mblock[0][blk][0][n] != 0)
				coded = 1;
		}

		if (coded)
			pattern |= 0x20 >> blk;

		mirror(mblock[0][blk]);
	}

	return pattern;
}

/*
 *  Reference intra dequantisation and inverse DCT of the six blocks
 *  in mblock[1].  The result, offset by 128 and limited to 0 ... 255,
 *  is written to the picture at newref; the position of each block is
 *  advanced by mb_address.block[].offset and its rows are
 *  mb_address.block[].pitch bytes apart.
 *
 *  quant_scale: quantiser scale used for dequantisation.
 */
void
mpeg1_idct_intra(int quant_scale)
{
	unsigned char *dest = newref;
	unsigned char *out;
	int blk, n, row, col, level;

	emms();

	for (blk = 0; blk < 6; blk++) {
		FLOAT F[8][8], t[8][8];

		dest += mb_address.block[blk].offset;

		mirror(mblock[1][blk]);

		/* First coefficient has the fixed multiplier 8 */
		F[0][0] = mblock[1][blk][0][0] * 8 * aan_inv_lut[0][0];

		for (n = 1; n < 64; n++) {
			level = (int)(mblock[1][blk][0][n] * 
				default_intra_quant_matrix[0][n] * quant_scale) / 8;

			/* mismatch control: even values move one towards zero */

			if ((level & 1) == 0)
				level -= sign(level);

			F[0][n] = aan_inv_lut[0][n] * saturate(level, -2048, 2047);
		}

		/* Two 1-D passes with a transposition after each */
		for (n = 0; n < 8; n++)
			aan_double_1d_idct(F[n], t[n]);

		mirror(t);

		for (n = 0; n < 8; n++)
			aan_double_1d_idct(t[n], F[n]);

		mirror(F);

		out = dest;

		for (row = 0; row < 8; row++) {
			for (col = 0; col < 8; col++)
				out[col] = saturate(lroundn(F[row][col]) + 128, 0, 255);

			out += mb_address.block[blk].pitch;
		}
	}
}

/*
 *  Reference inter dequantisation and inverse DCT.
 *
 *  quant_scale: quantiser scale used for dequantisation
 *  cbp:         bit (0x20 >> i) set means block i of mblock[0] holds
 *               coefficients
 *
 *  For a block with its bit set the reconstructed difference is added
 *  to mblock[3] and the sum, limited to 0 ... 255, is written to the
 *  picture at newref; otherwise mblock[3] is copied unchanged.
 */
void
mpeg1_idct_inter(int quant_scale, unsigned int cbp)
{
	FLOAT F[8][8], t[8][8];
	unsigned char *dest = newref;
	unsigned char *out;
	int blk, n, row, col, level;

	emms();

	for (blk = 0; blk < 6; blk++) {
		dest += mb_address.block[blk].offset;
		out = dest;

		if (!(cbp & (0x20 >> blk))) {
			/* Nothing coded for this block, plain copy */
			for (row = 0; row < 8; row++) {
				for (col = 0; col < 8; col++)
					out[col] = mblock[3][blk][row][col];

				out += mb_address.block[blk].pitch;
			}

			continue;
		}

		mirror(mblock[0][blk]);

		for (n = 0; n < 64; n++) {
			level = (2 * mblock[0][blk][0][n] + sign(mblock[0][blk][0][n])) * quant_scale;

			/* mismatch control: even values move one towards zero */

			if ((level & 1) == 0)
				level -= sign(level);

			F[0][n] = aan_inv_lut[0][n] * saturate(level, -2048, 2047);
		}

		/* Two 1-D passes with a transposition after each */
		for (n = 0; n < 8; n++)
			aan_double_1d_idct(F[n], t[n]);

		mirror(t);

		for (n = 0; n < 8; n++)
			aan_double_1d_idct(t[n], F[n]);

		mirror(F);

		/* The difference is not limited on its own, only the sum is */
		for (row = 0; row < 8; row++) {
			for (col = 0; col < 8; col++)
				out[col] = saturate(lroundn(F[row][col]) + mblock[3][blk][row][col], 0, 255);

			out += mb_address.block[blk].pitch;
		}
	}
}

/*
 *  Reference intra dequantisation and inverse DCT of the six blocks
 *  in mblock[1], with mismatch control on the sum of all coefficients:
 *  when the sum is even, bit 0 of the last coefficient is toggled.
 *  The result, offset by 128 and limited to 0 ... 255, is written to
 *  the picture at newref.
 *
 *  quant_scale: quantiser scale used for dequantisation.
 *
 *  The destination advances by the offset of the current block
 *  (block[i]), matching the pitch lookup below and
 *  mpeg1_idct_intra(); the former block[0].offset moved every block
 *  by the offset of the first one.
 */
void
mpeg2_idct_intra(int quant_scale)
{
	int i, j, k, val, sum;
	unsigned char *p, *new = newref;

	emms();

	for (i = 0; i < 6; i++)	{
		FLOAT F[8][8], t[8][8];

		new += mb_address.block[i].offset;

		mirror(mblock[1][i]);

		/* sum starts with the dequantised first coefficient */
		F[0][0] = (sum = mblock[1][i][0][0] * 8) * aan_inv_lut[0][0];

		for (j = 1; j < 64; j++) {
			val = (int)(mblock[1][i][0][j] * 
				default_intra_quant_matrix[0][j] * quant_scale) / 8;

			sum += val = saturate(val, -2048, 2047);

			/* mismatch control, applied to the last coefficient only */

			if (j == 63 && !(sum & 1))
				val ^= 1;

			F[0][j] = aan_inv_lut[0][j] * val;
		}

		/* Two 1-D passes with a transposition after each */
		for (j = 0; j < 8; j++)
			aan_double_1d_idct(F[j], t[j]);

		mirror(t);

		for (j = 0; j < 8; j++)
			aan_double_1d_idct(t[j], F[j]);

		mirror(F);

		for (j = 0, p = new; j < 8; j++) {
			for (k = 0; k < 8; k++)
				p[k] = saturate(lroundn(F[j][k]) + 128, 0, 255);
			p += mb_address.block[i].pitch;
		}
	}
}

/*
 *  Reference inter dequantisation and inverse DCT with mismatch
 *  control on the sum of all coefficients: when the sum is even,
 *  bit 0 of the last coefficient is toggled.
 *
 *  quant_scale: quantiser scale used for dequantisation
 *  cbp:         bit (0x20 >> i) set means block i of mblock[0] holds
 *               coefficients
 *
 *  For a block with its bit set the reconstructed difference is added
 *  to mblock[3] and the sum, limited to 0 ... 255, is written to the
 *  picture at newref; otherwise mblock[3] is copied unchanged.
 *
 *  The destination advances by the offset of the current block
 *  (block[i]), matching the pitch lookup below and
 *  mpeg1_idct_inter(); the former block[0].offset moved every block
 *  by the offset of the first one.
 */
void
mpeg2_idct_inter(int quant_scale, unsigned int cbp)
{
	FLOAT F[8][8], t[8][8];
	unsigned char *new = newref;
	int i, j, k, val, sum;

	emms();

	for (i = 0; i < 6; i++) {
		new += mb_address.block[i].offset;

		if (cbp & (0x20 >> i)) {
			unsigned char *p = new;

			mirror(mblock[0][i]);

			for (j = 0, sum = 0; j < 64; j++) {
				val = (2 * mblock[0][i][0][j] + sign(mblock[0][i][0][j])) * quant_scale;

				sum += val = saturate(val, -2048, 2047);

				/* mismatch control, applied to the last coefficient only */

				if (j == 63 && !(sum & 1))
					val ^= 1;

				F[0][j] = aan_inv_lut[0][j] * val;
			}

			/* Two 1-D passes with a transposition after each */
			for (j = 0; j < 8; j++)
				aan_double_1d_idct(F[j], t[j]);

			mirror(t);

			for (j = 0; j < 8; j++)
				aan_double_1d_idct(t[j], F[j]);

			mirror(F);

			for (j = 0; j < 8; j++) {
				for (k = 0; k < 8; k++)
#if 1
					p[k] = saturate(lroundn(F[j][k]) + mblock[3][i][j][k], 0, 255);
#else
					p[k] = saturate(saturate(lroundn(F[j][k]), -128, 127) + mblock[3][i][j][k], 0, 255);
#endif
				p += mb_address.block[i].pitch;
			}
		} else {
			unsigned char *p = new;

			/* Nothing coded for this block, plain copy */
			for (j = 0; j < 8; j++) {
				for (k = 0; k < 8; k++)
					p[k] = mblock[3][i][j][k];
				p += mb_address.block[i].pitch;
			}
		}
	}
}

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: filter.c,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#include "../common/log.h"
#include "../common/mmx.h"
#include "../common/math.h"
#include "../options.h"
#include "video.h"

/*
 *  Currently selected input filter.  The filter functions in this
 *  file take two buffer pointers and return an int (the reference
 *  YUYV_422() returns a luma activity measure).
 */
int			(* filter)(unsigned char *, unsigned char *);
// removed bool			temporal_interpolation;

/* Six entry permutation table; NOTE(review): meaning of the order not visible here */
const char		cbp_order[6] = { 5, 4, 3, 1, 2, 0 };

/*
 *  Human readable names of the filter modes, indexed by mode number.
 *  Entry 0 is "invalid", the entries commented as REMOVED keep their
 *  slot and the last three slots are empty strings.
 */
const char *
filter_labels[] = {
	"invalid",
	"YUV 4:2:0 fastest",
	"YUYV 4:2:2 fastest",
	"YUYV 4:2:2 w/vertical decimation",
	"YUYV 4:2:2 w/temporal interpolation", /* REMOVED */
	"YUYV 4:2:2 w/vertical interpolation",
	"YUYV 4:2:2 field progressive 50/60 Hz",
	"YUYV 4:2:2 50/60 Hz w/temporal interpolation", /* REMOVED */
	"YVU 4:2:0 fastest",
	"",
	"",
	"",
};

/*
 *  Source buffer geometry used by the filters below: byte offsets
 *  added to the buffer address and the distance between two lines.
 *  Not static (the keyword is commented out), so visible to other
 *  translation units.
 */
/* static */ int	filter_y_offs,
			filter_u_offs,
			filter_v_offs,
			filter_y_pitch;

/* Filters implemented outside this file (mmx_ prefix) */
extern int		mmx_YUV_420(unsigned char *, unsigned char *);
extern int		mmx_YUYV_422(unsigned char *, unsigned char *);
extern int		mmx_YUYV_422_2v(unsigned char *, unsigned char *);
extern int		mmx_YUYV_422_ti(unsigned char *, unsigned char *);
extern int		mmx_YUYV_422_vi(unsigned char *, unsigned char *);

/* Reference */

/*
 *  Reference filter: copies the macroblock at (mb_row, mb_col) from a
 *  packed YUYV picture into mblock[0].
 *
 *  buffer: start of the picture; lines are filter_y_pitch bytes apart
 *  unused: ignored
 *
 *  Returns 256 * (sum of squares) - (sum) squared over the 256 luma
 *  samples, the luma spatial activity.
 */
int
YUYV_422(unsigned char *buffer, unsigned char *unused)
{
	const unsigned char *line;
	unsigned int sample, sum = 0, sum_sq = 0;
	int row, col, k;

	buffer += filter_y_pitch * mb_row * 16 + mb_col * 16 * 2 + filter_y_offs;

	/*
	 *  Luma: every second byte.  The left eight columns go to block
	 *  0 and the right eight to block 2; rows 8 ... 15 run on into
	 *  the following block (block order Y0 Y2 Y1 Y3).
	 */
	for (row = 0; row < 16; row++) {
		line = buffer + row * filter_y_pitch;

		for (col = 0; col < 8; col++) {
			mblock[0][0][row][col] = (short) line[col * 2 + 0];
			mblock[0][2][row][col] = (short) line[col * 2 + 16];
		}
	}

	/* Chroma: bytes 1 and 3 of each four byte group, every second line */
	for (row = 0; row < 8; row++) {
		line = buffer + row * filter_y_pitch * 2;

		for (col = 0; col < 8; col++) {
			mblock[0][4][row][col] = (short) line[col * 4 + 1];
			mblock[0][5][row][col] = (short) line[col * 4 + 3];
		}
	}

	/* Accumulate over the four luma blocks */
	for (k = 0; k < 4 * 64; k++) {
		sample = mblock[0][0][0][k];
		sum += sample;
		sum_sq += sample * sample;
	}

	return sum_sq * 256 - (sum * sum);
}

static int (* color_pred)(unsigned char *, unsigned char *);

/* Hum. Could add rendered subpictures. */

/*
 *  Filter wrapper: runs the filter stored in color_pred and then
 *  overwrites 32 quadwords (256 bytes) starting at mblock[0][4] with
 *  the packed constant c128.  The result of color_pred is returned
 *  unchanged.
 *
 *  NOTE(review): presumably this replaces both chroma blocks by a
 *  neutral value so the picture becomes monochrome -- confirm against
 *  the layout of mblock.
 */
static int
color_trap(unsigned char *buffer1, unsigned char *buffer2)
{
	int r = color_pred(buffer1, buffer2);

	/* %0 is the destination address; mm0 holds c128 for all stores */
	asm volatile (
		"\t movq c128,%%mm0;\n"
		"\t movq %%mm0,(%0);	movq %%mm0,1*8(%0);\n"
		"\t movq %%mm0,2*8(%0);	movq %%mm0,3*8(%0);\n"
		"\t movq %%mm0,4*8(%0);	movq %%mm0,5*8(%0);\n"
		"\t movq %%mm0,6*8(%0);	movq %%mm0,7*8(%0);\n"
		"\t movq %%mm0,8*8(%0);	movq %%mm0,9*8(%0);\n"
		"\t movq %%mm0,10*8(%0); movq %%mm0,11*8(%0);\n"
		"\t movq %%mm0,12*8(%0); movq %%mm0,13*8(%0);\n"
		"\t movq %%mm0,14*8(%0); movq %%mm0,15*8(%0);\n"
		"\t movq %%mm0,16*8(%0); movq %%mm0,17*8(%0);\n"
		"\t movq %%mm0,18*8(%0); movq %%mm0,19*8(%0);\n"
		"\t movq %%mm0,20*8(%0); movq %%mm0,21*8(%0);\n"
		"\t movq %%mm0,22*8(%0); movq %%mm0,23*8(%0);\n"
		"\t movq %%mm0,24*8(%0); movq %%mm0,25*8(%0);\n"
		"\t movq %%mm0,26*8(%0); movq %%mm0,27*8(%0);\n"
		"\t movq %%mm0,28*8(%0); movq %%mm0,29*8(%0);\n"
		"\t movq %%mm0,30*8(%0); movq %%mm0,31*8(%0);\n"
	:: "D" (&mblock[0][4][0][0]) : "cc", "memory" FPU_REGS);

	return r;
}

/* Experimental low pass filter */

/*
 *  Like YUYV_422(), but every luma sample is replaced by a 5 x 5
 *  weighted average (weights sum to 144, result rounded).  The window
 *  starts at the sample itself and extends four lines down and four
 *  samples to the right; there is no border handling.  Chroma is
 *  copied unfiltered.
 *
 *  buffer: start of the picture; lines are filter_y_pitch bytes apart
 *  unused: ignored
 *
 *  Returns 256 * (sum of squares) - (sum) squared over the 256
 *  filtered luma samples.
 */
int
YUYV_422_exp1(unsigned char *buffer, unsigned char *unused)
{
	static const char
	weight[5][5] = {
		{ 1,  3,  4,  3, 1 },
		{ 3,  9, 12,  9, 3 },
		{ 4, 12, 16, 12, 4 },
		{ 3,  9, 12,  9, 3 },
		{ 1,  3,  4,  3, 1 },
	};
	const unsigned char *tap;
	unsigned int acc, sample, sum = 0, sum_sq = 0;
	int row, col, k;
	int wy, wx;

	buffer += filter_y_pitch * mb_row * 16 + mb_col * 16 * 2 + filter_y_offs;

	for (row = 0; row < 16; row++) {
		for (col = 0; col < 8; col++) {
			/* left half of the macroblock */
			acc = 0;

			for (wy = 0; wy < 5; wy++) {
				tap = buffer + (row + wy) * filter_y_pitch + col * 2;

				for (wx = 0; wx < 5; wx++)
					acc += tap[wx * 2] * weight[wy][wx];
			}

			mblock[0][0][row][col] = (acc + 72) / 144;

			/* right half, 16 bytes further into the line */
			acc = 0;

			for (wy = 0; wy < 5; wy++) {
				tap = buffer + (row + wy) * filter_y_pitch + col * 2;

				for (wx = 0; wx < 5; wx++)
					acc += tap[wx * 2 + 16] * weight[wy][wx];
			}

			mblock[0][2][row][col] = (acc + 72) / 144;
		}
	}

	/* Chroma: bytes 1 and 3 of each four byte group, every second line */
	for (row = 0; row < 8; row++) {
		tap = buffer + row * filter_y_pitch * 2;

		for (col = 0; col < 8; col++) {
			mblock[0][4][row][col] = (short) tap[col * 4 + 1];
			mblock[0][5][row][col] = (short) tap[col * 4 + 3];
		}
	}

	for (k = 0; k < 4 * 64; k++) {
		sample = mblock[0][0][0][k];
		sum += sample;
		sum_sq += sample * sample;
	}

	return sum_sq * 256 - (sum * sum);
}

/* Experimental low pass filter */

/*
 *  3 x 3 weighted sum around *p with the weights
 *
 *	1 2 1
 *	2 4 2
 *	1 2 1
 *
 *  row_step and col_step are the byte distances to the neighbour
 *  one line down and one sample to the right.
 */
static unsigned int
exp2_blur3x3(const unsigned char *p, int row_step, int col_step)
{
	unsigned int acc;

	acc =	p[-row_step - col_step] +
		p[-row_step + col_step] +
		p[+row_step - col_step] +
		p[+row_step + col_step];
	acc +=  (p[-row_step] +
		p[+row_step] +
		p[-col_step] +
		p[+col_step]) * 2;
	acc +=	p[0] * 4;

	return acc;
}

/*
 *  Runs mmx_YUYV_422() first and returns its result when that is
 *  below 65536 * 128.  Otherwise the macroblock is fetched again with
 *  the 3 x 3 filter above applied to luma and chroma (weights sum to
 *  16, result rounded) and the activity of the filtered luma is
 *  returned.  The window reaches one line and one sample beyond the
 *  macroblock on every side; there is no border handling.
 *
 *  buffer:  start of the picture; lines are filter_y_pitch bytes apart
 *  buffer2: only passed on to mmx_YUYV_422()
 */
int
YUYV_422_exp2(unsigned char *buffer, unsigned char *buffer2)
{
	const unsigned char *centre;
	unsigned int sample, sum = 0, sum_sq = 0;
	int activity;
	int row, col, k;

	activity = mmx_YUYV_422(buffer, buffer2);

	if (activity < 65536 * 128)
		return activity;

	buffer += filter_y_pitch * mb_row * 16 + mb_col * 16 * 2 + filter_y_offs;

	/* Luma: samples two bytes apart, right half 16 bytes further */
	for (row = 0; row < 16; row++) {
		for (col = 0; col < 8; col++) {
			centre = buffer + row * filter_y_pitch + col * 2;

			mblock[0][0][row][col] = (exp2_blur3x3(centre, filter_y_pitch, 2) + 8) >> 4;
			mblock[0][2][row][col] = (exp2_blur3x3(centre + 16, filter_y_pitch, 2) + 8) >> 4;
		}
	}

	/* Chroma: samples four bytes apart, every second line */
	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++) {
			centre = buffer + row * filter_y_pitch * 2 + col * 4;

			mblock[0][4][row][col] = (exp2_blur3x3(centre + 1, filter_y_pitch * 2, 4) + 8) >> 4;
			mblock[0][5][row][col] = (exp2_blur3x3(centre + 3, filter_y_pitch * 2, 4) + 8) >> 4;
		}
	}

	for (k = 0; k < 4 * 64; k++) {
		sample = mblock[0][0][0][k];
		sum += sample;
		sum_sq += sample * sample;
	}

	return sum_sq * 256 - (sum * sum);
}

/*
 *  Experimental low pass filter with vertical decimation for packed
 *  YUYV 4:2:2 input.
 *
 *  Corresponding lines of buffer and buffer2 are averaged (rounded)
 *  into a 19 line by 40 byte scratch area, taking every second source
 *  line starting one line above source line mb_row * 32 and four bytes
 *  left of the macroblock. The scratch area is then filtered into
 *  mblock[0] with the 3 x 3 kernel 1 2 1 / 2 4 2 / 1 2 1, divided by
 *  16 and rounded.
 *
 *  Returns 256 * sum(Y * Y) - sum(Y) * sum(Y) over the 256 filtered
 *  luma samples.
 *
 *  NOTE(review): the luma centre tap for output column x is scratch
 *  byte (x + 1) * 2, i.e. source byte 2 * x - 2, while the chroma centre
 *  tap is source byte 4 * x + 1 (or + 3). Luma therefore looks shifted
 *  one pixel to the left relative to chroma - confirm whether that is
 *  intended.
 */

/*
 *  Apply the kernel to the 3 x 3 window whose top left sample is *tl;
 *  dy is the byte distance between rows, dx between columns.
 */
static unsigned int
exp3_smooth(const unsigned char *tl, int dy, int dx)
{
	unsigned int acc;

	acc  = tl[0] + tl[2 * dx] + tl[2 * dy] + tl[2 * dy + 2 * dx];
	acc += (tl[dx] + tl[2 * dy + dx] + tl[dy] + tl[dy + 2 * dx]) * 2;
	acc += tl[dy + dx] * 4;

	return (acc + 8) >> 4;
}

int
YUYV_422_exp3(unsigned char *buffer, unsigned char *buffer2)
{
	static unsigned char temp[19 * 40];
	unsigned int sample, sum = 0, sum_sq = 0;
	const unsigned char *src1, *src2;
	unsigned char *dst;
	int pitch = filter_y_pitch;
	int start, row, col, i;

	start = pitch * (mb_row * 32 - 1) + mb_col * 16 * 2 + filter_y_offs;

	src1 = buffer + start;
	src2 = buffer2 + start;
	dst = temp;

	/* Average both sources into the scratch area, two lines per step. */
	for (row = 0; row < 19; row++) {
		for (col = 0; col < 40; col++)
			dst[col] = (src1[col - 4] + src2[col - 4] + 1) >> 1;

		dst += 40;
		src1 += pitch * 2;
		src2 += pitch * 2;
	}

	/* Luma, left and right half of the macroblock. */
	for (row = 0; row < 16; row++)
		for (col = 0; col < 8; col++) {
			const unsigned char *win = temp + row * 40 + col * 2;

			mblock[0][0][row][col] = exp3_smooth(win, 40, 2);
			mblock[0][2][row][col] = exp3_smooth(win + 16, 40, 2);
		}

	/* Chroma, every second scratch line. */
	for (row = 0; row < 8; row++)
		for (col = 0; col < 8; col++) {
			const unsigned char *win = temp + row * 40 * 2 + col * 4;

			mblock[0][4][row][col] = exp3_smooth(win + 1, 40 * 2, 4);
			mblock[0][5][row][col] = exp3_smooth(win + 3, 40 * 2, 4);
		}

	/* The four luma blocks are walked as one run of 256 samples. */
	for (i = 0; i < 4 * 64; i++) {
		sample = mblock[0][0][0][i];
		sum += sample;
		sum_sq += sample * sample;
	}

	return sum_sq * 256 - (sum * sum);
}

/*
 *  Experimental thresholded averaging filter for packed YUYV 4:2:2
 *  input.
 *
 *  Every luma sample is replaced by the rounded mean of those samples
 *  in its 4 x 4 neighbourhood (offsets -2 ... +1 in both directions)
 *  for which 40 >= nbabs(neighbour - sample) holds; nbabs presumably
 *  yields the absolute value. Chroma is copied unfiltered from every
 *  second line. The second argument is not used.
 *
 *  Returns 256 * sum(Y * Y) - sum(Y) * sum(Y) over the 256 filtered
 *  luma samples.
 */

/*
 *  Mean of the samples around *centre that are close to it. The centre
 *  sample itself always qualifies, so the divisor is never zero.
 */
static unsigned int
exp4_similar_mean(const unsigned char *centre, int pitch)
{
	unsigned int ref = centre[0];
	unsigned int total = 0, hits = 0, smp;
	int dy, dx;

	for (dy = -2; dy < +2; dy++) {
		const unsigned char *line = centre + dy * pitch;

		for (dx = -2; dx < +2; dx++) {
			smp = line[dx * 2];

			if (40 >= nbabs(smp - ref)) {
				total += smp;
				hits++;
			}
		}
	}

	return (total + (hits >> 1)) / hits;
}

int
YUYV_422_exp4(unsigned char *buffer, unsigned char *unused)
{
	unsigned int sample, sum = 0, sum_sq = 0;
	const unsigned char *origin, *line;
	int pitch = filter_y_pitch;
	int row, col, i;

	origin = buffer + pitch * mb_row * 16 + mb_col * 16 * 2 + filter_y_offs;

	/* Luma, left and right half of the macroblock. */
	for (row = 0; row < 16; row++) {
		line = origin + row * pitch;

		for (col = 0; col < 8; col++) {
			mblock[0][0][row][col] = exp4_similar_mean(line + col * 2, pitch);
			mblock[0][2][row][col] = exp4_similar_mean(line + col * 2 + 16, pitch);
		}
	}

	/* Chroma: plain copy of bytes 1 and 3 from every second line. */
	for (row = 0; row < 8; row++) {
		line = origin + row * pitch * 2;

		for (col = 0; col < 8; col++) {
			mblock[0][4][row][col] = (short) line[col * 4 + 1];
			mblock[0][5][row][col] = (short) line[col * 4 + 3];
		}
	}

	/* The four luma blocks are walked as one run of 256 samples. */
	for (i = 0; i < 4 * 64; i++) {
		sample = mblock[0][0][0][i];
		sum += sample;
		sum_sq += sample * sample;
	}

	return sum_sq * 256 - (sum * sum);
}

/*
 *  Select the input filter for filter_mode and compute the offsets of
 *  the encoding window inside the captured image.
 *
 *  Input:
 *  grab_width, grab_height (captured image, pixels)
 *  width, height (encoded image, pixels)
 *  pitch (line distance of the Y or YUYV data, bytes)
 *
 *  Assumed for the planar modes:
 *  Y plane size = pitch * grab_height,
 *  the U,V (or V,U) planes start 4/4 and 5/4 Y plane sizes after Y,
 *  U,V pitch = pitch / 2
 *
 *  Output:
 *  width, height (pixels, possibly reduced)
 *  filter, filter_y_pitch, filter_y_offs, filter_u_offs, filter_v_offs
 *  and, when luma_only is set, color_pred
 */
void
filter_init(int pitch)
{
	int luma_bytes = 2;			/* bytes per pixel in a luma line */
	int hscale = 1, vscale = 1;		/* source pixels per encoded pixel */
	int cb_plane = 4, cr_plane = 5;		/* plane start, quarters of the Y plane */
	int quarter_plane = 0;
	int span_w, span_h;			/* source area covered by whole macroblocks */
	int left, top;

	switch (filter_mode) {
	case CM_YVU:
		cb_plane = 5;
		cr_plane = 4;
		/* fall through */
	case CM_YUV:
		filter = mmx_YUV_420;
		quarter_plane = pitch * grab_height / 4;
		luma_bytes = 1;
		break;

	case CM_YUYV:
	case CM_YUYV_PROGRESSIVE:
		filter = mmx_YUYV_422;
		break;

	case CM_YUYV_EXP:
		filter = YUYV_422_exp2;
		width = saturate(grab_width, 1, grab_width - 16);
		height = saturate(grab_height, 1, grab_height - 16);
		break;

	case CM_YUYV_EXP2:
		filter = YUYV_422_exp4;
		width = saturate(grab_width, 1, grab_width - 16);
		height = saturate(grab_height, 1, grab_height - 16);
		break;

	case CM_YUYV_EXP_VERTICAL_DECIMATION:
		FAIL("Sorry, the selected filter mode was experimental and is no longer available.\n");
		filter = YUYV_422_exp3;
		vscale = 2;
		width = saturate(grab_width, 1, grab_width - 16);
		height = saturate(grab_height / 2, 1, grab_height / 2 - 16);
		break;

	case CM_YUYV_VERTICAL_DECIMATION:
		filter = mmx_YUYV_422_2v;
		vscale = 2;
		break;

	case CM_YUYV_VERTICAL_INTERPOLATION:
		filter = mmx_YUYV_422_vi;
		break;

	case CM_YUYV_TEMPORAL_INTERPOLATION:
	case CM_YUYV_PROGRESSIVE_TEMPORAL:
		FAIL("Sorry, the selected filter mode (temporal interpolation) is no longer available.\n");
		filter = mmx_YUYV_422_ti;
		break;

	default:
		FAIL("Filter '%s' out of order",
			filter_labels[filter_mode]);
	}

	/*
	 *  There is no clipping yet: all memory accesses are whole
	 *  16 x 16 macroblocks, so the window is shrunk to a multiple of
	 *  16 whenever the rounded up size would not fit into the
	 *  captured image.
	 */
	span_w = hscale * ((width + 15) & ~15);
	span_h = vscale * ((height + 15) & ~15);

	if (span_w > grab_width) {
		width = (grab_width / hscale) & ~15;
		span_w = hscale * width;
	}

	if (span_h > grab_height) {
		height = (grab_height / vscale) & ~15;
		span_h = vscale * height;
	}

	/* Centre the encoding window, then pull it back inside the image. */
	left = (grab_width - hscale * width + 1) >> 1;
	top = (grab_height - vscale * height + 1) >> 1;

	if (left + span_w > grab_width)
		left = grab_width - span_w;

	if (top + span_h > grab_height)
		top = grab_height - span_h;

	filter_y_pitch = pitch;
	filter_y_offs = top * pitch + left * luma_bytes;

	/*
	 *  NOTE(review): filter_y_offs >> 2 contributes left / 4 to the
	 *  chroma column offset; with the pitch / 2 chroma pitch assumed
	 *  above one would expect left / 2 - confirm for planar input
	 *  with a non-zero left offset.
	 */
	filter_u_offs = cb_plane * quarter_plane + (filter_y_offs >> 2);
	filter_v_offs = cr_plane * quarter_plane + (filter_y_offs >> 2);

	printv(2, "Filter '%s'\n", filter_labels[filter_mode]);

	/* Luma only: keep the real filter in color_pred, install the trap. */
	if (luma_only) {
		color_pred = filter;
		filter = color_trap;
	}
}

--- NEW FILE ---
#
#  MPEG-1 Real Time Encoder
# 
#  Copyright (C) 1999-2000 Michael H. Schimek
# 
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License version 2 as
#  published by the Free Software Foundation.
# 
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
# 
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

# $Id: filter_mmx.s,v 1.1 2001/12/04 23:58:09 mswitch Exp $

# int
# mmx_YUV_420(unsigned char *buffer, unsigned char *unused)
#
# Copies the 16 x 16 macroblock selected by mb_row and mb_col from a
# planar image into mblock, widening every 8 bit sample to a 16 bit word.
# Luma is stored from mblock+0 on (left 8 columns first, then the right
# 8 columns), the samples read through filter_u_offs to mblock+512 and
# those read through filter_v_offs to mblock+640.
# While the luma is copied %mm6 accumulates the sum of the samples and
# %mm7 the sum of their squares; the value returned in %eax is
# sum_of_squares * 256 - sum * sum.
# The second argument is never read. %ecx, %edx, %ebx, %esi and %edi are
# saved and restored. No emms instruction is executed here.

	.text
	.align		16
	.globl		mmx_YUV_420

mmx_YUV_420:

	/* Reserve five stack slots, save registers, compute source pointers */
	leal		-20(%esp),%esp;
	movl		%edi,16(%esp);
	movl		1*4+20(%esp),%edi;	// buffer
	movl		%edx,12(%esp);
	movl		filter_y_pitch,%edx;
	movl		%esi,8(%esp);
	movl		mb_row,%esi;
	movl		%ebx,4(%esp);
	movl		mb_col,%ebx;
	movl		%ecx,0(%esp);
	imull		%edx,%esi;		// row = filter_y_pitch * mb_row
	sall		$2,%esi;
	leal		(%esi,%ebx,8),%eax;
	leal		(%esi,%ebx,4),%esi;
	addl		%edi,%eax;		// chroma = buffer + row * 4 + mb_col * 8
	leal		(%edi,%esi,4),%esi;
	movl		%eax,%ebx;
	addl		filter_u_offs,%eax;	// filter_u_offs + chroma
	addl		filter_v_offs,%ebx;	// filter_v_offs + chroma
	addl		filter_y_offs,%esi;	// filter_y_offs + buffer + row * 16 + mb_col * 16
	movl		$mblock,%edi;

	/* Cb, Cr */
	/*
	 * %edx is halved to the chroma pitch and %mm7 is cleared to serve as
	 * the zero operand of the unpack instructions. Eight rows of eight
	 * bytes are read through each of %eax and %ebx. The last stores are
	 * interleaved with the first two luma loads; at the end %mm5 and %mm6
	 * are cleared from %mm7 and %edx is doubled back to the luma pitch.
	 */

	movq		(%eax),%mm0;			shrl		$1,%edx;
	movq		(%ebx),%mm1;			pxor		%mm7,%mm7;
	movq		(%eax,%edx),%mm4;		movq		%mm0,%mm2;
	movq		(%ebx,%edx),%mm5;		punpcklbw	%mm7,%mm0;
	movq		(%ebx,%edx,2),%mm6;		punpckhbw	%mm7,%mm2;
	movq		%mm0,512+0*16+0(%edi);		movq		%mm1,%mm3;
	movq		(%eax,%edx,2),%mm0;		punpcklbw	%mm7,%mm1;
	movq		%mm2,512+0*16+8(%edi);		punpckhbw	%mm7,%mm3;
	movq		%mm1,640+0*16+0(%edi);		movq		%mm4,%mm2;
	lea		(%ebx,%edx,2),%ebx;		punpcklbw	%mm7,%mm4;
	movq		%mm3,640+0*16+8(%edi);		punpckhbw	%mm7,%mm2;
	lea		(%eax,%edx,2),%eax;		movq		%mm5,%mm3;
	movq		%mm4,512+1*16+0(%edi);		punpcklbw	%mm7,%mm5;
	movq		(%eax,%edx),%mm4;		punpckhbw	%mm7,%mm3;
	movq		%mm2,512+1*16+8(%edi);		lea		(%eax,%edx,2),%eax;
	movq		(%ebx,%edx),%mm2;		lea		(%ebx,%edx,2),%ebx;
	movq		%mm5,640+1*16+0(%edi);		movq		%mm0,%mm5;
	movq		%mm3,640+1*16+8(%edi);		punpcklbw	%mm7,%mm0;
	movq		(%eax),%mm1;			punpckhbw	%mm7,%mm5;
	movq		%mm0,512+2*16+0(%edi);		movq		%mm6,%mm3;
	movq		(%ebx),%mm0;			punpcklbw	%mm7,%mm6;
	movq		%mm5,512+2*16+8(%edi);		punpckhbw	%mm7,%mm3;
	movq		%mm6,640+2*16+0(%edi);		movq		%mm4,%mm5;
	movq		(%eax,%edx),%mm6;		punpcklbw	%mm7,%mm4;
	movq		%mm3,640+2*16+8(%edi);		punpckhbw	%mm7,%mm5;
	movq		%mm4,512+3*16+0(%edi);		movq		%mm2,%mm3;
	movq		(%ebx,%edx),%mm4;		punpcklbw	%mm7,%mm2;
	movq		%mm5,512+3*16+8(%edi);		punpckhbw	%mm7,%mm3;
	movq		%mm2,640+3*16+0(%edi);		movq		%mm1,%mm2;
	movq		%mm3,640+3*16+8(%edi);		punpcklbw	%mm7,%mm1;
	movq		(%eax,%edx,2),%mm3;		punpckhbw	%mm7,%mm2;
	movq		%mm1,512+4*16+0(%edi);		lea		(%eax,%edx,2),%eax;
	movq		(%ebx,%edx,2),%mm1;		movq		%mm0,%mm5;
	lea		(%ebx,%edx,2),%ebx;		punpcklbw	%mm7,%mm0;
	movq		%mm2,512+4*16+8(%edi);		punpckhbw	%mm7,%mm5;
	movq		%mm0,640+4*16+0(%edi);		movq		%mm6,%mm2;
	movq		(%eax,%edx),%mm0;		punpcklbw	%mm7,%mm6;
	movq		%mm5,640+4*16+8(%edi);		punpckhbw	%mm7,%mm2;
	movq		%mm6,512+5*16+0(%edi);		movq		%mm4,%mm6;
	movq		(%ebx,%edx),%mm5;		punpcklbw	%mm7,%mm4;
	movq		%mm2,512+5*16+8(%edi);		punpckhbw	%mm7,%mm6;
	movq		%mm3,%mm2;			punpcklbw	%mm7,%mm3;
	movq		%mm4,640+5*16+0(%edi);		punpckhbw	%mm7,%mm2;
	movq		%mm1,%mm4;			punpcklbw	%mm7,%mm1;
	movq		%mm6,640+5*16+8(%edi);		punpckhbw	%mm7,%mm4;
	movq		%mm3,512+6*16+0(%edi);		movq		%mm0,%mm3;
	movq		%mm2,512+6*16+8(%edi);		punpcklbw	%mm7,%mm0;
	movq		%mm1,640+6*16+0(%edi);		punpckhbw	%mm7,%mm3;
	movq		(%esi,%edx,2),%mm1;		movq		%mm5,%mm2;
	movq		%mm0,512+7*16+0(%edi);		punpcklbw	%mm7,%mm5;
	movq		(%esi),%mm0;			punpckhbw	%mm7,%mm2;
	movq		%mm3,512+7*16+8(%edi);		movl		%esi,%eax;
	movl		%esi,%ebx;			movl		$7,%ecx;
	movq		%mm4,640+6*16+8(%edi);		movq		%mm7,%mm6;
	movq		%mm5,640+7*16+0(%edi);		movq		%mm7,%mm5;
	movq		%mm2,640+7*16+8(%edi);		shll		$1,%edx;

	/* Y left 8 x 16 */
	/*
	 * Each pass handles two rows (%mm0 and %mm1, loaded one pass ahead)
	 * and writes 32 bytes; seven passes plus the unrolled tail after the
	 * loop make 16 rows. %mm5 is zero, %mm6 the running sum of the
	 * samples, %mm7 the running sum of their squares.
	 */
1:
	movq		%mm0,%mm2;			punpcklbw	%mm5,%mm0;
	paddw		%mm0,%mm6;			punpckhbw	%mm5,%mm2;
	paddw		%mm2,%mm6;			movq		%mm0,%mm3;
	movq		%mm2,%mm4;			pmaddwd		%mm3,%mm3;
	movq		%mm0,(%edi);			lea		(%eax,%edx,2),%eax;
	movq		(%eax),%mm0;			pmaddwd		%mm4,%mm4;
	movq		%mm2,8(%edi);			paddd		%mm3,%mm7;
	movq		%mm1,%mm2;			addl		$32,%edi;
	paddd		%mm4,%mm7;			punpcklbw	%mm5,%mm1;
	paddw		%mm1,%mm6;			punpckhbw	%mm5,%mm2;
	paddw		%mm2,%mm6;			movq		%mm1,%mm3;
	movq		%mm2,%mm4;			pmaddwd		%mm3,%mm3;
	movq		%mm1,-16(%edi);			pmaddwd		%mm4,%mm4;
	movq		(%eax,%edx),%mm1;		decl		%ecx;
	movq		%mm2,-8(%edi);			paddd		%mm3,%mm7;
	paddd		%mm4,%mm7;			jne		1b;

	/*
	 * Last two rows of the left half. %eax is reset to the first luma
	 * row plus 8 bytes and the first two rows of the right half are
	 * loaded.
	 */
	movl		%esi,%eax;			movq		%mm0,%mm2;
	addl		$8,%eax;			punpcklbw	%mm5,%mm0;
	paddw		%mm0,%mm6;			punpckhbw	%mm5,%mm2;
	paddw		%mm2,%mm6;			movq		%mm0,%mm3;
	movq		%mm2,%mm4;			pmaddwd		%mm3,%mm3;
	movq		%mm0,(%edi);			pmaddwd		%mm4,%mm4;
	movq		(%eax),%mm0;			paddd		%mm3,%mm7;
	movq		%mm2,8(%edi);			movq		%mm1,%mm2;
	paddd		%mm4,%mm7;			punpcklbw	%mm5,%mm1;
	paddw		%mm1,%mm6;			punpckhbw	%mm5,%mm2;
	paddw		%mm2,%mm6;			movq		%mm1,%mm3;
	movq		%mm2,%mm4;			pmaddwd		%mm3,%mm3;
	movq		%mm1,16(%edi);			pmaddwd		%mm4,%mm4;
	movq		%mm2,24(%edi);			addl		$32,%edi
	paddd		%mm3,%mm7;			movl		$7,%ecx;
	movq		(%eax,%edx),%mm1;		paddd		%mm4,%mm7;

	/* Y right 8 x 16 */
	/* Same scheme as the left half. */
2:
	movq		%mm0,%mm2;			punpcklbw	%mm5,%mm0;
	paddw		%mm0,%mm6;			punpckhbw	%mm5,%mm2;
	paddw		%mm2,%mm6;			movq		%mm0,%mm3;
	movq		%mm2,%mm4;			pmaddwd		%mm3,%mm3;
	movq		%mm0,(%edi);			lea		(%eax,%edx,2),%eax;
	movq		(%eax),%mm0;			pmaddwd		%mm4,%mm4;
	movq		%mm2,8(%edi);			paddd		%mm3,%mm7;
	movq		%mm1,%mm2;			addl		$32,%edi;
	paddd		%mm4,%mm7;			punpcklbw	%mm5,%mm1;
	paddw		%mm1,%mm6;			punpckhbw	%mm5,%mm2;
	paddw		%mm2,%mm6;			movq		%mm1,%mm3;
	movq		%mm2,%mm4;			pmaddwd		%mm3,%mm3;
	movq		%mm1,-16(%edi);			pmaddwd		%mm4,%mm4;
	movq		(%eax,%edx),%mm1;		decl		%ecx;
	movq		%mm2,-8(%edi);			paddd		%mm3,%mm7;
	paddd		%mm4,%mm7;			jne		2b;

	/*
	 * Last two rows of the right half, then the reduction: the four
	 * word sums in %mm6 are folded into one word (moved to %eax), the
	 * two dword sums in %mm7 are added and shifted left by 8. mull
	 * squares the sum; subl and negl leave
	 * sum_of_squares * 256 - sum * sum in %eax. %edx, clobbered by
	 * mull, is popped afterwards.
	 */
	movq		%mm0,%mm2;			punpcklbw	%mm5,%mm0;
	paddw		%mm0,%mm6;			punpckhbw	%mm5,%mm2;
	paddw		%mm2,%mm6;			movq		%mm0,%mm3;
	movq		%mm2,%mm4;			pmaddwd		%mm3,%mm3;
	movq		%mm0,(%edi);			pmaddwd		%mm4,%mm4;
	movq		%mm2,8(%edi);			paddd		%mm3,%mm7;
	movq		%mm1,%mm2;			punpcklbw	%mm5,%mm1;
	paddd		%mm4,%mm7;			punpckhbw	%mm5,%mm2;
	paddw		%mm1,%mm6;			movq		%mm1,%mm3;
	paddw		%mm2,%mm6;			pmaddwd		%mm3,%mm3;
	movq		%mm2,%mm4;			paddd		%mm3,%mm7;
	movq		%mm1,16(%edi);			pmaddwd		%mm4,%mm4;
	movq		%mm2,24(%edi);			movq		%mm6,%mm2;
	paddd		%mm4,%mm7;			psllq		$32,%mm6;
	paddw		%mm2,%mm6;			movq		%mm7,%mm4;
	movq		%mm6,%mm3;			pslld		$16,%mm6;
	paddw		%mm3,%mm6;			psrlq		$32,%mm7;
	paddd		%mm4,%mm7;			psrlq		$48,%mm6;
	movd		%mm6,%eax;			pslld		$8,%mm7;
	popl		%ecx;				mull		%eax;
	popl		%ebx;				movd		%mm7,%edi;
	popl		%esi;				subl		%edi,%eax;
	popl		%edx;				negl		%eax;
	popl		%edi;
	ret

# int
# mmx_YUYV_422(unsigned char *buffer, unsigned char *unused)
#
# Copies the 16 x 16 macroblock selected by mb_row and mb_col from a
# packed YUYV image into mblock. The luma words (source words masked
# with c255) go to mblock+0 (left 8 columns) and mblock+256 (right 8
# columns). The high bytes of the even and odd source words of every
# second line go to mblock+512 and mblock+640 respectively; chroma of
# the other lines is not read.
# %mm6 accumulates the sum of the luma samples and %mm7 the sum of
# their squares; the value returned in %eax is
# sum_of_squares * 256 - sum * sum.
# The second argument is never read. %ecx, %edx, %ebx and %edi are saved
# and restored. No emms instruction is executed here.
# c255 is defined elsewhere; presumably 0x00ff in every word.

	.text
	.align		16
	.globl		mmx_YUYV_422

mmx_YUYV_422:

	/* Reserve four stack slots, save registers, compute the source pointer */
	leal		-16(%esp),%esp;
	movl		%ecx,12(%esp);
	movl		mb_col,%eax;		// + mb_col * 16 * 2
	movl		%edx,8(%esp);
	sall		$5,%eax;
	movl		filter_y_pitch,%edx;
	movl		%ebx,4(%esp);
	imull		mb_row,%edx;		// + mb_row * 16 * filter_y_pitch
	movl		%edi,(%esp);
	pxor		%mm6,%mm6;
	sall		$4,%edx;
	movq		c255,%mm5;
	addl		%edx,%eax;
	addl		filter_y_offs,%eax;	// + filter_y_offs
	pxor		%mm7,%mm7;
	addl		1*4+16(%esp),%eax;	// + buffer
	movq		(%eax),%mm0;
	movl		$mblock+512,%ebx;	// mblock[0][4] (chroma)
	movl		$mblock,%edi;
	movl		$7,%ecx;
	movl		filter_y_pitch,%edx;
	/*
	 * Each pass handles two source lines: luma and chroma of the line
	 * at %eax, luma only of the line at %eax + %edx. %edi advances by
	 * 32 bytes, %ebx by 16 bytes and %eax by two lines per pass. Seven
	 * passes plus the unrolled tail after the loop make 16 lines.
	 * The two punpck?wd steps gather the even source words in %mm3 and
	 * the odd ones in %mm2; psrlw $8 then keeps their high bytes.
	 */
1:
	movq		8(%eax),%mm4;			movq		%mm0,%mm3;
	movq		%mm0,%mm1;			punpcklwd	%mm4,%mm3;
	movq		%mm3,%mm2;			punpckhwd	%mm4,%mm1;
	pand		%mm5,%mm0;			punpcklwd	%mm1,%mm3;
	pand		%mm5,%mm4;			punpckhwd	%mm1,%mm2;
	movq		%mm0,(%edi);			paddw		%mm0,%mm6;
	psrlw		$8,%mm3;			pmaddwd		%mm0,%mm0;
	movq		%mm4,8(%edi);			paddw		%mm4,%mm6;
	psrlw		$8,%mm2;			pmaddwd		%mm4,%mm4;
	movq		%mm3,(%ebx);			paddd		%mm0,%mm7;
	movq		16(%eax),%mm0;			leal		32(%edi),%edi;
	movq		%mm2,128+0(%ebx);		paddd		%mm4,%mm7;
	movq		24(%eax),%mm4;			movq		%mm0,%mm3;
	movq		%mm0,%mm1;			punpcklwd	%mm4,%mm3;
	decl		%ecx;				punpckhwd	%mm4,%mm1;
	movq		%mm3,%mm2;			punpcklwd	%mm1,%mm3;
	pand		%mm5,%mm0;			punpckhwd	%mm1,%mm2;
	movq		(%eax,%edx),%mm1;		pand		%mm5,%mm4;
	movq		%mm0,256-32(%edi);		paddw		%mm0,%mm6;
	psrlw		$8,%mm3;			pmaddwd		%mm0,%mm0;
	movq		%mm4,256+8-32(%edi);		paddw		%mm4,%mm6;
	psrlw		$8,%mm2;			pmaddwd		%mm4,%mm4;
	movq		%mm3,8(%ebx);			paddd		%mm0,%mm7;
	movq		(%eax,%edx,2),%mm0;		pand		%mm5,%mm1;
	movq		%mm2,128+8(%ebx);		paddd		%mm4,%mm7;
	movq		8(%eax,%edx),%mm2;		paddw		%mm1,%mm6;
	movq		16(%eax,%edx),%mm3;		pand		%mm5,%mm2;
	movq		%mm1,16-32(%edi);		paddw		%mm2,%mm6;
	movq		24(%eax,%edx),%mm4;		pmaddwd		%mm1,%mm1;
	leal		(%eax,%edx,2),%eax;		pand		%mm5,%mm3;
	movq		%mm2,24-32(%edi);		pmaddwd		%mm2,%mm2;
	pand		%mm5,%mm4;			paddw		%mm3,%mm6;
	paddd		%mm1,%mm7;			paddw		%mm4,%mm6;
	movq		%mm4,%mm1;			pmaddwd		%mm4,%mm4;
	movq		%mm3,256+16-32(%edi);		pmaddwd		%mm3,%mm3;
	leal		16(%ebx),%ebx;			paddd		%mm2,%mm7;
	movq		%mm1,256+24-32(%edi);		paddd		%mm4,%mm7;
	paddd		%mm3,%mm7;			jne		1b

	/*
	 * Last two lines, followed by the reduction: the four word sums in
	 * %mm6 are folded into one word (moved to %eax), the two dword sums
	 * in %mm7 are added and shifted left by 8 (moved to %ecx). mull
	 * squares the sum; subl and negl leave
	 * sum_of_squares * 256 - sum * sum in %eax. %edx, clobbered by
	 * mull, is popped afterwards.
	 */
	movq		8(%eax),%mm4;			movq		%mm0,%mm3;
	movq		%mm0,%mm1;			punpcklwd	%mm4,%mm3;
	movq		%mm3,%mm2;			punpckhwd	%mm4,%mm1;
	pand		%mm5,%mm0;			punpcklwd	%mm1,%mm3;
	pand		%mm5,%mm4;			punpckhwd	%mm1,%mm2;
	movq		%mm0,(%edi);			paddw		%mm0,%mm6;
	psrlw		$8,%mm3;			pmaddwd		%mm0,%mm0;
	movq		%mm4,8(%edi);			paddw		%mm4,%mm6;
	psrlw		$8,%mm2;			pmaddwd		%mm4,%mm4;
	movq		%mm3,(%ebx);			paddd		%mm0,%mm7;
	movq		16(%eax),%mm0;			leal		32(%edi),%edi;
	movq		%mm2,128+0(%ebx);		paddd		%mm4,%mm7;
	movq		24(%eax),%mm4;			movq		%mm0,%mm3;
	movq		%mm0,%mm1;			punpcklwd	%mm4,%mm3;
	movq		%mm3,%mm2;			punpckhwd	%mm4,%mm1;
	pand		%mm5,%mm0;			punpcklwd	%mm1,%mm3;
	movq		%mm0,256-32(%edi);		punpckhwd	%mm1,%mm2;
	movq		(%eax,%edx),%mm1;		pand		%mm5,%mm4;
	psrlw		$8,%mm3;			paddw		%mm0,%mm6;
	movq		%mm4,256+8-32(%edi);		pmaddwd		%mm0,%mm0;
	psrlw		$8,%mm2;			paddw		%mm4,%mm6;
	movq		%mm3,8(%ebx);			pmaddwd		%mm4,%mm4;
	paddd		%mm0,%mm7;			pand		%mm5,%mm1;
	movq		%mm2,128+8(%ebx);		paddd		%mm4,%mm7;
	movq		8(%eax,%edx),%mm2;		paddw		%mm1,%mm6;
	movq		16(%eax,%edx),%mm3;		pand		%mm5,%mm2;
	movq		%mm1,16-32(%edi);		paddw		%mm2,%mm6;
	movq		24(%eax,%edx),%mm4;		pmaddwd		%mm1,%mm1;
	movq		%mm2,24-32(%edi);		pand		%mm5,%mm3;
	paddw		%mm3,%mm6;			pmaddwd		%mm2,%mm2;
	paddd		%mm1,%mm7;			pand		%mm5,%mm4;
	movq		%mm3,256+16-32(%edi);		paddw		%mm4,%mm6;
	paddd		%mm2,%mm7;			pmaddwd		%mm3,%mm3;
	movq		%mm4,256+24-32(%edi);		pmaddwd		%mm4,%mm4;
	movq		%mm6,%mm2;			psllq		$32,%mm6;
	paddd		%mm3,%mm7;			paddw		%mm2,%mm6;
	paddd		%mm4,%mm7;			movq		%mm6,%mm3;
	movq		%mm7,%mm5;			psrlq		$32,%mm7;
	paddd		%mm5,%mm7;			pslld		$16,%mm6;
	paddw		%mm3,%mm6;			pslld		$8,%mm7;
	movd		%mm7,%ecx;			psrlq		$48,%mm6;
	movd		%mm6,%eax;
	popl		%edi;
	mull		%eax;
	popl		%ebx;
	subl		%ecx,%eax;
	popl		%edx;
	negl		%eax;
	popl		%ecx;
	ret

# int
# mmx_YUYV_422_2v(unsigned char *buffer, unsigned char *unused)
#
# Vertical decimation by two of a packed YUYV image: the 32 source
# lines starting at line mb_row * 32 are averaged pairwise into the 16
# lines of the macroblock in mblock. The entry code only computes the
# two source pointers and then falls through into the shared loop
# filter_s2t, which mmx_YUYV_422_ti and mmx_YUYV_422_vi also jump to.
# The second argument is never read.
#
# filter_s2t expects:
#   %esi  first source line (s1), %eax second source line (s2)
#   %edx  byte distance from a source line to the next line of the
#         same source
#   the callers %edx, %esi, %ebx, %ecx and %edi saved at 16, 12, 8, 4
#   and 0(%esp), directly below the return address
# Luma output is (s1 + s2 + 1) >> 1. Chroma output is the sum of the
# high bytes of four source lines (s1, s2 and the next line of each),
# plus 2, shifted right by 2. The value returned in %eax is
# sum_of_squares * 256 - sum * sum over the luma output. No emms
# instruction is executed here.
# c1, c2 and c255 are defined elsewhere; presumably 1, 2 and 0x00ff in
# every word.

	.text
	.align		16
	.globl		mmx_YUYV_422_2v

mmx_YUYV_422_2v:

	leal		-20(%esp),%esp;
	movl		%edx,16(%esp);
	movl		filter_y_pitch,%edx;
	movl		%esi,12(%esp);
	movl		mb_row,%esi;
	movl		%ebx,8(%esp);
	sall		$5,%esi;
	movl		%ecx,4(%esp);
	imull		%edx,%esi;
	movl		mb_col,%eax;
	movl		%edi,(%esp);
	sall		$5,%eax;
	addl		filter_y_offs,%esi;
	addl		%eax,%esi;
	addl		1*4+20(%esp),%esi;	// s1 = buffer + filter_y_pitch * mb_row * 32 + mb_col * 32 + filter_y_offs 
	leal		(%esi,%edx),%eax;	// s2 = s1 + filter_y_pitch
	sall		$1,%edx;		// filter_y_pitch * 2

	.align 16

filter_s2t:

	/*
	 * Eight passes. Each pass reads two lines from each source, writes
	 * two luma lines (left half at %edi, right half at 256(%edi)) and
	 * one chroma line (at -16(%ebx) and 128-16(%ebx) after %ebx has
	 * been advanced), then moves both source pointers on by 2 * %edx.
	 * %mm6 is the running sum of the luma output, %mm7 the running sum
	 * of its squares.
	 */
	movl		$mblock,%edi;			movl		$8,%ecx;
	movl		$mblock+512,%ebx;		pxor		%mm6,%mm6;
	movq		c255,%mm5;			pxor		%mm7,%mm7;
1:	
	movq		(%esi),%mm0;			leal		16(%ebx),%ebx;
	movq		(%eax),%mm1;			movq		%mm0,%mm2;
	pand		%mm5,%mm0;			movq		%mm1,%mm4;
	paddw		c1,%mm0;			pand		%mm5,%mm1;
	movq		(%eax,%edx),%mm3;		paddw		%mm0,%mm1;
	movq		(%esi,%edx),%mm0;		psrlw		$1,%mm1;
	paddw		%mm1,%mm6;			psrlw		$8,%mm2;
	movq		%mm1,(%edi);			pmaddwd		%mm1,%mm1;
	decl		%ecx;				psrlw		$8,%mm4;
	paddw		%mm2,%mm4;			movq		%mm0,%mm2;
	paddd		%mm1,%mm7;			pand		%mm5,%mm0;
	paddw		c1,%mm0;			movq		%mm3,%mm1;
	pand		%mm5,%mm1;			psrlw		$8,%mm2;
	paddw		%mm0,%mm1;			psrlw		$8,%mm3;
	movq		8(%esi),%mm0;			psrlw		$1,%mm1;
	paddw		%mm1,%mm6;			paddw		%mm2,%mm4;
	movq		%mm1,16(%edi);			pmaddwd		%mm1,%mm1;
	movq		8(%eax),%mm5;			movq		%mm0,%mm2;
	paddw		%mm3,%mm4;			movq		%mm5,%mm3;
	pand		c255,%mm0;			paddd		%mm1,%mm7;
	paddw		c1,%mm0;			psrlw		$8,%mm2;
	pand		c255,%mm3;			psrlw		$8,%mm5;
	paddw		%mm0,%mm3;			paddw		%mm2,%mm5;
	movq		8(%esi,%edx),%mm0;		psrlw		$1,%mm3;
	paddw		%mm3,%mm6;			movq		%mm0,%mm2;
	movq		%mm3,8(%edi);			pmaddwd		%mm3,%mm3;
	pand		c255,%mm0;			psrlw		$8,%mm2;
	movq		8(%eax,%edx),%mm1;		paddw		%mm2,%mm5;
	paddd		%mm3,%mm7;			movq		%mm1,%mm3;
	pand		c255,%mm1;			psrlw		$8,%mm3;
	paddw		%mm0,%mm1;			paddw		%mm3,%mm5;
	paddw		c1,%mm1;			movq		%mm4,%mm3;
	movq		16(%esi),%mm0;			psrlw		$1,%mm1;
	paddw		%mm1,%mm6;			punpcklwd	%mm5,%mm4;
	movq		%mm1,24(%edi);			pmaddwd		%mm1,%mm1;
	punpckhwd	%mm5,%mm3;			movq		%mm4,%mm5;
	movq		%mm0,%mm2;			punpcklwd	%mm3,%mm4;
	paddw		c2,%mm4;			punpckhwd	%mm3,%mm5;
	paddw		c2,%mm5;			paddd		%mm1,%mm7;
	movq		16(%eax),%mm1;			psraw		$2,%mm4;
	movq		c255,%mm3;			psraw		$2,%mm5;
	movq		%mm4,-16(%ebx);			pand		%mm3,%mm0;
	movq		%mm5,128+0-16(%ebx);		movq		%mm1,%mm4;
	paddw		c1,%mm0;			pand		%mm3,%mm1;
	paddw		%mm0,%mm1;			psrlw		$8,%mm2;
	movq		16(%esi,%edx),%mm0;		psrlw		$1,%mm1;
	movq		%mm1,256(%edi);			paddw		%mm1,%mm6;
	pmaddwd		%mm1,%mm1;			psrlw		$8,%mm4;
	movq		16(%eax,%edx),%mm5;		paddw		%mm2,%mm4;
	movq		%mm0,%mm2;			pand		%mm3,%mm0;
	paddd		%mm1,%mm7;			movq		%mm5,%mm1;
	pand		%mm3,%mm1;			psrlw		$8,%mm2;
	paddw		%mm0,%mm1;			psrlw		$8,%mm5;
	paddw		c1,%mm1;			paddw		%mm2,%mm4;
	movq		24(%esi),%mm0;			psrlw		$1,%mm1;
	movq		%mm1,256+16(%edi);		paddw		%mm1,%mm6;
	pmaddwd		%mm1,%mm1;			paddw		%mm5,%mm4;
	movq		24(%eax),%mm3;			movq		%mm0,%mm2;
	pand		c255,%mm0;			movq		%mm3,%mm5;
	paddw		c1,%mm0;			psrlw		$8,%mm2;
	pand		c255,%mm5;			psrlw		$8,%mm3;
	paddw		%mm0,%mm5;			paddw		%mm2,%mm3;
	movq		24(%esi,%edx),%mm0;		psrlw		$1,%mm5;
	movq		%mm5,256+8(%edi);		paddw		%mm5,%mm6;
	paddd		%mm1,%mm7;			pmaddwd		%mm5,%mm5;
	movq		24(%eax,%edx),%mm1;		movq		%mm0,%mm2;
	psrlw		$8,%mm2;			leal		(%esi,%edx,2),%esi;
	pand		c255,%mm0;			paddw		%mm2,%mm3;
	paddd		%mm5,%mm7;			leal		(%eax,%edx,2),%eax;
	movq		%mm1,%mm5;			movq		%mm4,%mm2;
	pand		c255,%mm1;			psrlw		$8,%mm5;
	paddw		%mm5,%mm3;			paddw		%mm0,%mm1;
	paddw		c1,%mm1;			punpcklwd	%mm3,%mm4;
	psrlw		$1,%mm1;			punpckhwd	%mm3,%mm2;
	movq		%mm4,%mm3;			punpcklwd	%mm2,%mm4;
	paddw		%mm1,%mm6;			punpckhwd	%mm2,%mm3;
	movq		%mm1,256+24(%edi);		pmaddwd		%mm1,%mm1;
	paddw		c2,%mm4;			leal		32(%edi),%edi;
	paddw		c2,%mm3;			psraw		$2,%mm4;
	movq		c255,%mm5;			psraw		$2,%mm3;
	movq		%mm4,8-16(%ebx);		paddd		%mm1,%mm7;
	movq		%mm3,128+8-16(%ebx);		jne		1b;

	/*
	 * Reduction and epilogue: pmaddwd with c1 adds adjacent word sums,
	 * the two dwords of %mm6 and of %mm7 are added, the squares are
	 * shifted left by 8, and %eax = sum_of_squares * 256 - sum * sum.
	 * The pops restore the registers in the slot order listed above.
	 */
	pmaddwd		c1,%mm6;			movq		%mm7,%mm0;
	psrlq		$32,%mm7;			popl		%edi;
	paddd		%mm0,%mm7;			popl		%ecx;
	movq		%mm6,%mm5;			psrlq		$32,%mm6;
	paddd		%mm5,%mm6;			pslld		$8,%mm7;
	movd		%mm6,%edx;			popl		%ebx;
	movd		%mm7,%eax;			imul		%edx,%edx;
	subl		%edx,%eax;			popl		%esi;
	popl		%edx;
	ret

# int
# mmx_YUYV_422_ti(unsigned char *buffer1, unsigned char *buffer2)
#
# Averages the macroblock selected by mb_row and mb_col of two packed
# YUYV images: computes the same byte offset into buffer1 (%esi) and
# buffer2 (%eax), leaves filter_y_pitch in %edx and jumps into the
# shared loop filter_s2t, which produces the output in mblock, restores
# the registers saved here and returns to the caller.

	.text
	.align		16
	.globl		mmx_YUYV_422_ti

mmx_YUYV_422_ti:

	/* buffer2 is fetched before the stack pointer is moved */
	movl		2*4(%esp),%eax;
	/* Save registers in the slot order filter_s2t pops them */
	leal		-20(%esp),%esp;
	movl		%edx,16(%esp);
	movl		filter_y_pitch,%edx;		// filter_y_pitch
	movl		%esi,12(%esp);
	movl		mb_row,%esi;
	movl		%ebx,8(%esp);
	sall		$4,%esi;
	movl		%ecx,4(%esp);
	imull		%edx,%esi;
	movl		%edi,(%esp);
	movl		mb_col,%ecx;
	sall		$5,%ecx;
	addl		filter_y_offs,%esi;
	addl		%ecx,%esi;
	addl		%esi,%eax;		// s2 = buffer2 + filter_y_pitch * mb_row * 16 + mb_col * 32 + filter_y_offs
	addl		1*4+20(%esp),%esi;	// s1 = buffer1 + filter_y_pitch * mb_row * 16 + mb_col * 32 + filter_y_offs
	jmp		filter_s2t;

# int
# mmx_YUYV_422_vi(unsigned char *buffer, unsigned char *unused)
#
# Vertical interpolation of a packed YUYV image: every line of the
# macroblock is averaged with the source line below it, using the
# shared loop filter_s2t with s2 = s1 + filter_y_pitch. This reads one
# line beyond the macroblock, so when mb_row is not less than
# mb_last_row the stack is unwound again and the plain copy
# mmx_YUYV_422 is used instead.

	.text
	.align		16
	.globl		mmx_YUYV_422_vi

mmx_YUYV_422_vi:

	leal		-20(%esp),%esp;
	movl		%esi,12(%esp);
	movl		mb_row,%esi;
	movl		%edx,16(%esp);
	cmpl		mb_last_row,%esi;
	movl		%ebx,8(%esp);
	jl		1f;
	/* Last row: only %esi has been changed so far; restore it and tail call */
	movl		12(%esp),%esi;
	leal		20(%esp),%esp;
	jmp		mmx_YUYV_422;

	.p2align 4,,7
1:
	movl		filter_y_pitch,%edx;		// filter_y_pitch
	movl		%ecx,4(%esp);
	movl		%edi,(%esp);
	sall		$4,%esi;
	/* NOTE(review): this load of the second argument is overwritten by the leal into %eax below before it is used */
	movl		2*4+20(%esp),%eax;
	movl		mb_col,%ecx;
	imull		%edx,%esi;
	sall		$5,%ecx;
	addl		filter_y_offs,%esi;
	addl		%ecx,%esi;
	addl		1*4+20(%esp),%esi;	// s1 = buffer + filter_y_pitch * mb_row * 16 + mb_col * 32 + filter_y_offs
	leal		(%esi,%edx),%eax;	// s2 = buffer + filter_y_pitch * (mb_row * 16 + 1) + mb_col * 32 + filter_y_offs
	jmp		filter_s2t;

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2001 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: libvideo.h,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#include "../rtepriv.h"
#include "../systems/libsystems.h"

#include "video.h" // XXX REMOVE

/* Codec class object of the MPEG-1 video encoder; defined elsewhere. */
extern rte_codec_class	mp1e_mpeg1_video_codec;

/*
 *  Video encoder initialisation; defined elsewhere.
 *  NOTE(review): nothing in this header documents the parameters. Judging
 *  by their names only, they are the target codec, a CPU type selector,
 *  the coded picture size, a motion search range, the fifo delivering
 *  captured frames and the multiplexer (plus module id) receiving the
 *  output - confirm against the definition.
 */
extern void
video_init(rte_codec *codec, int cpu_type,
	   int coded_width, int coded_height,
	   int motion_min, int motion_max,
	   fifo *capture_fifo,
	   unsigned int module, multiplexer *mux);

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: mblock.c,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#include "video.h"
#include "../common/math.h"

/*
 *  Macroblock position and picture size, all in units of macroblocks.
 *  mb_width, mb_height, mb_last_col, mb_last_row and mb_num are set by
 *  video_coding_size(); mb_col and mb_row select the macroblock the
 *  input filters fetch.
 */
int			mb_col, mb_row,			// current
    			mb_width, mb_height,
			mb_last_col, mb_last_row,
			mb_num;

uint8_t * newref;			/* future reference frame buffer */

/*
 *  Packed reference buffer format is
 *  [mb_height]
 *  [mb_width]  - for all macroblocks of a frame
 *  [6]         - Y0, Y2, Y1, Y3, Cb, Cr
 *  [8][8]      - 8 bit unsigned samples, e. g. according to ITU-R Rec. 601
 */

struct mb_addr		mb_address __attribute__ ((aligned (MIN(CACHE_LINE, 64))));

short			mblock[7][6][8][8] __attribute__ ((aligned (4096)));
/*
 *  Buffer for current macroblock
 *  [7]    - intra, forward, backward, interpolated
 *           (only the first four of the seven planes are named here)
 *  [6]    - Y0, Y2, Y1, Y3, Cb, Cr; i. e. the left 8 luma columns
 *           (16 rows) come first, then the right 8 columns, then the
 *           two 8 x 8 chroma blocks
 *  [8][8] - samples, block difference, dct coefficients
 */

/*
 *  Derive the picture size in macroblocks from a size in pixels.
 *
 *  width and height are first limited to 1 ... MAX_WIDTH and
 *  1 ... MAX_HEIGHT, then rounded up to whole 16 x 16 macroblocks.
 *  Sets mb_width, mb_height, mb_last_col, mb_last_row and mb_num.
 */
void
video_coding_size(int width, int height)
{
	int clipped_w = saturate(width, 1, MAX_WIDTH);
	int clipped_h = saturate(height, 1, MAX_HEIGHT);

	mb_width = (clipped_w + 15) >> 4;
	mb_height = (clipped_h + 15) >> 4;
	mb_num = mb_width * mb_height;

	/* Index of the last macroblock column and row. */
	mb_last_col = mb_width - 1;
	mb_last_row = mb_height - 1;
}

/*
 *  B picture: encode & discard; I or P picture must be encoded ahead of
 *  all B pictures forward referencing the I or P picture, ie. we will
 *  stack as many captured pictures as there are B pictures in a row
 *  plus the following I or P. The capture module may add one or two
 *  more for double buffering.
 */
/*
 *  Returns one plus the length of the longest run of 'B' characters in
 *  gop_sequence that is followed by an 'I' or a 'P'.
 *
 *  At most the first 1024 characters are examined, and scanning stops
 *  at the first character that is not 'I', 'P' or 'B' (the terminating
 *  NUL included). A run of 'B' that is still open when scanning stops
 *  is not counted.
 */
int
video_look_ahead(char *gop_sequence)
{
	int longest = 0;
	int run = 0;
	int pos;

	for (pos = 0; pos < 1024; pos++) {
		char type = gop_sequence[pos];

		if (type == 'B') {
			run++;
			continue;
		}

		if (type != 'I' && type != 'P')
			break;

		/* A reference picture closes the current run of B pictures. */
		if (run > longest)
			longest = run;

		run = 0;
	}

	return longest + 1;
}

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: mblock.h,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#ifndef MBLOCK_H
#define MBLOCK_H

/* OBSOLETE */

#endif // MBLOCK_H

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *  Motion compensation V3.1.39
 *
 *  Copyright (C) 2001 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
[...3899 lines suppressed...]
	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++) {
			mblock[1][4][i][j] = mblock[0][4][i][j] - p1[j];
			mblock[2][4][i][j] = mblock[0][4][i][j] - p2[j];
			mblock[3][4][i][j] = mblock[0][4][i][j] - ((p1[j] + p2[j] + 1) >> 1);
			mblock[1][5][i][j] = mblock[0][5][i][j] - p1[j + mb_address.block[5].offset];
			mblock[2][5][i][j] = mblock[0][5][i][j] - p2[j + mb_address.block[5].offset];
			mblock[3][5][i][j] = mblock[0][5][i][j] - ((p1[j + mb_address.block[5].offset] + p2[j + mb_address.block[5].offset] + 1) >> 1);
		}

		p1 += mb_address.block[4].pitch;
		p2 += mb_address.block[4].pitch;
	}

	*vmc1 = sf * 256;
	*vmc2 = sb * 256;

	return si * 256;
}

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 2001 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: motion.h,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#ifndef MOTION_H
#define MOTION_H

#include "vlc.h"
#include "mblock.h"

/*
 *  GCC regparm attribute: the first n integer arguments are passed in
 *  registers instead of on the stack. The assembly prototypes further
 *  down annotate the registers used as eax, edx, ecx.
 */
#define reg(n) __attribute__ ((regparm (n)))

/* Globals defined elsewhere; their meaning is not visible in this header. */
extern int		motion;
extern int		mm_buf_offs;

/* motion.c */

/*
 *  Signature shared by all motion search implementations.
 *  NOTE(review): dhx/dhy are presumably in/out displacement components
 *  and the return value a prediction error measure -- confirm in motion.c.
 */
typedef unsigned int (search_fn)(int *dhx, int *dhy, unsigned char *from,
			int x, int y, int range, short dest[6][8][8]);

/* One search routine per instruction set, plus a pointer of the same type. */
extern search_fn	mmx_search, _3dn_search, sse_search, sse2_search;
extern search_fn *	search;

/*
 *  Reference prediction routines. The single-source variants take their
 *  pointer argument in a register (reg(1)); the bidirectional variants
 *  additionally write two values through vmc1 and vmc2.
 */
extern unsigned int	predict_forward_packed(unsigned char *from) reg(1);
extern unsigned int	predict_forward_planar(unsigned char *from) reg(1);
extern unsigned int	predict_backward_packed(unsigned char *from) reg(1);
extern unsigned int	predict_bidirectional_packed(unsigned char *from1, unsigned char *from2, unsigned int *vmc1, unsigned int *vmc2);
extern unsigned int	predict_bidirectional_planar(unsigned char *from1, unsigned char *from2, unsigned int *vmc1, unsigned int *vmc2);

/* Motion compensated variants operating on a struct motion. */
extern unsigned int	predict_forward_motion(struct motion *M, unsigned char *, int);
extern unsigned int	predict_bidirectional_motion(mpeg1_context *mpeg1, struct motion *M, unsigned int *, unsigned int *, int);

extern void		zero_forward_motion(void);
extern void		t7(int range, int dist);

/* motion_mmx.s */

/*
 *  NB we use mmx_predict_forward also for backward prediction (in B pictures
 *  within a closed gop, low profile) discarding the reconstruction.
 *  No mmx_predict_bidi_planar, use reference version.
 */
extern unsigned int	mmx_predict_forward_packed(unsigned char *) reg(1);
extern unsigned int	mmx_predict_forward_planar(unsigned char *) reg(1);
extern unsigned int	mmx_predict_bidirectional_packed(unsigned char *from1, unsigned char *from2, unsigned int *vmc1, unsigned int *vmc2);

/*
 *  Attention mmx_mbsum uses mblock[4] as permanent scratch in picture_i|p();
 *  Source mblock[0], dest mm_mbrow and bp
 */
extern void		mmx_mbsum(char * /* eax */) reg(1);
/*
 *  SAD routines, one per instruction set. All three take their arguments
 *  in registers (reg(3)): t in eax, p in edx, pitch in ecx.
 *  NOTE(review): sse2_sad sums absolute differences of a 16x16 block;
 *  mmx_sad and sse_sad presumably do the same -- confirm in motion_mmx.s.
 */
extern int		mmx_sad(unsigned char t[16][16] /* eax */, unsigned char *p /* edx */, int pitch /* ecx */) reg(3);
extern int		sse_sad(unsigned char t[16][16] /* eax */, unsigned char *p /* edx */, int pitch /* ecx */) reg(3);
/* <t> must be 16 byte aligned */
extern int		sse2_sad(unsigned char t[16][16] /* eax */, unsigned char *p /* edx */, int pitch /* ecx */) reg(3);

#endif /* MOTION_H */

--- NEW FILE ---
#
#  MPEG-1 Real Time Encoder
# 
#  Copyright (C) 1999-2000 Michael H. Schimek
# 
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License version 2 as
#  published by the Free Software Foundation.
# 
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
# 
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

[...1211 lines suppressed...]
		psadbw		6*32+8(%eax),%mm1;
		paddw		%mm1,%mm7;
		movq		8(%edx),%mm1;
		psadbw		6*32+16(%eax),%mm2;
		paddw		%mm2,%mm7;
		movq		(%edx,%ecx),%mm2;
		psadbw		6*32+24(%eax),%mm3;
		paddw		%mm3,%mm7;
		movq		8(%edx,%ecx),%mm3;
		leal		(%edx,%ecx,2),%edx;
		psadbw		7*32(%eax),%mm0;
		paddw		%mm0,%mm7;
		psadbw		7*32+8(%eax),%mm1;
		paddw		%mm1,%mm7;
		psadbw		7*32+16(%eax),%mm2;
		paddw		%mm2,%mm7;
		psadbw		7*32+24(%eax),%mm3;
		paddw		%mm3,%mm7;
		movd		%mm7,%eax;
		ret;

--- NEW FILE ---
#
#  MPEG-1 Real Time Encoder
# 
#  Copyright (C) 2001 Michael H. Schimek
# 
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License version 2 as
#  published by the Free Software Foundation.
# 
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
# 
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

# $Id: motion_sse2.s,v 1.1 2001/12/04 23:58:09 mswitch Exp $

		.text
		.align		16
		.globl		sse2_sad

# sse2_sad -- sum of absolute differences over a 16x16 byte block.
#
# In:   %eax  block t, 16 rows of 16 bytes stored back to back;
#             used as a psadbw memory operand, so it is
#             assumed 16 byte aligned
#       %edx  block p, first row; read with unaligned loads
#       %ecx  distance in bytes between successive rows of p
# Out:  %eax  the sum
# Uses: %edx, %xmm0-%xmm3, %xmm6, %xmm7
#
# Rows of p are loaded two at a time, up to four rows ahead of the
# psadbw that consumes them. Each psadbw leaves one partial sum in
# each 64 bit half of its register; %xmm7 accumulates both halves.

sse2_sad:
# Preload rows 0-3 of p and clear the accumulator.
		movdqu		(%edx),%xmm0;
		pxor		%xmm7,%xmm7;
		movdqu		(%edx,%ecx),%xmm1;
		leal		(%edx,%ecx,2),%edx;
		movdqu		(%edx),%xmm2;
		movdqu		(%edx,%ecx),%xmm3;
		leal		(%edx,%ecx,2),%edx;
# Rows 0-3 against t, while loading rows 4-7.
		psadbw		(%eax),%xmm0;
		paddw		%xmm0,%xmm7;
		movdqu		(%edx),%xmm0;
		psadbw		1*16(%eax),%xmm1;
		paddw		%xmm1,%xmm7;
		movdqu		(%edx,%ecx),%xmm1;
		leal		(%edx,%ecx,2),%edx;
		psadbw		2*16(%eax),%xmm2;
		paddw		%xmm2,%xmm7;
		movdqu		(%edx),%xmm2;
		psadbw		3*16(%eax),%xmm3;
		paddw		%xmm3,%xmm7;
		movdqu		(%edx,%ecx),%xmm3;
		leal		(%edx,%ecx,2),%edx;
# Rows 4-7 against t, while loading rows 8-11.
		psadbw		4*16(%eax),%xmm0;
		paddw		%xmm0,%xmm7;
		movdqu		(%edx),%xmm0;
		psadbw		5*16(%eax),%xmm1;
		paddw		%xmm1,%xmm7;
		movdqu		(%edx,%ecx),%xmm1;
		leal		(%edx,%ecx,2),%edx;
		psadbw		6*16(%eax),%xmm2;
		paddw		%xmm2,%xmm7;
		movdqu		(%edx),%xmm2;
		psadbw		7*16(%eax),%xmm3;
		paddw		%xmm3,%xmm7;
		movdqu		(%edx,%ecx),%xmm3;
		leal		(%edx,%ecx,2),%edx;
# Rows 8-11 against t, while loading rows 12-15.
		psadbw		8*16(%eax),%xmm0;
		paddw		%xmm0,%xmm7;
		movdqu		(%edx),%xmm0;
		psadbw		9*16(%eax),%xmm1;
		paddw		%xmm1,%xmm7;
		movdqu		(%edx,%ecx),%xmm1;
		leal		(%edx,%ecx,2),%edx;
		psadbw		10*16(%eax),%xmm2;
		paddw		%xmm2,%xmm7;
		movdqu		(%edx),%xmm2;
		psadbw		11*16(%eax),%xmm3;
		paddw		%xmm3,%xmm7;
		movdqu		(%edx,%ecx),%xmm3;
# Rows 12-15 against t; nothing left to load.
		psadbw		12*16(%eax),%xmm0;
		paddw		%xmm0,%xmm7;
		psadbw		13*16(%eax),%xmm1;
		paddw		%xmm1,%xmm7;
		psadbw		14*16(%eax),%xmm2;
		paddw		%xmm2,%xmm7;
		psadbw		15*16(%eax),%xmm3;
		paddw		%xmm3,%xmm7;
# The shuffle selector picks dwords 2,3,0,1, i.e. swaps the two
# 64 bit halves, so the add folds the high partial sum into the low one.
# The total is at most 256 * 255 = 65280 and fits the 16 bit lane.
		pshufd		$1*64+0*16+3*4+2,%xmm7,%xmm6;
		paddw		%xmm6,%xmm7;
		movd		%xmm7,%eax;
		ret;

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: mpeg.h,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#ifndef MPEG_H
#define MPEG_H

/*
 *  Start code values. Each is the three byte prefix 0x000001 followed
 *  by a one byte code, written as a single 32 bit constant.
 */
#define PICTURE_START_CODE		0x00000100L
#define SLICE_START_CODE		0x00000101L
#define USER_DATA_START_CODE		0x000001B2L
#define SEQUENCE_HEADER_CODE		0x000001B3L
#define SEQUENCE_ERROR_CODE		0x000001B4L
#define EXTENSION_START_CODE		0x000001B5L
#define SEQUENCE_END_CODE		0x000001B7L
#define GROUP_START_CODE		0x000001B8L

/* Picture types; numbering starts at 1, there is no type 0. */
typedef enum {
	I_TYPE = 1,
	P_TYPE = 2,
	B_TYPE = 3,
	D_TYPE = 4
} picture_type;

/* Extension identifiers; note that value 6 is not assigned. */
typedef enum {
	SEQUENCE_EXTENSION_ID			= 1,
	SEQUENCE_DISPLAY_EXTENSION_ID		= 2,
	QUANT_MATRIX_EXTENSION_ID		= 3,
	COPYRIGHT_EXTENSION_ID			= 4,
	SEQUENCE_SCALABLE_EXTENSION_ID		= 5,
	PICTURE_DISPLAY_EXTENSION_ID		= 7,
	PICTURE_CODING_EXTENSION_ID		= 8,
	PICTURE_SPATIAL_SCALABLE_EXTENSION_ID	= 9,
	PICTURE_TEMPORAL_SCALABLE_EXTENSION_ID	= 10
} extension_id;

/* Macroblock prediction modes, numbered from 0. */
typedef enum {
	MB_INTRA	= 0,
	MB_FORWARD	= 1,
	MB_BACKWARD	= 2,
	MB_INTERP	= 3
} mb_type;

/* tables.c */

/* Constant tables. */
extern const double frame_rate_value[16];
extern const unsigned char default_intra_quant_matrix[8][8];
extern const unsigned char default_inter_quant_matrix[8][8];
extern const unsigned char scan[2][8][8];
extern const unsigned char quantiser_scale[2][32];
/*
 *  Variable length code tables. Each element packs a code and its bit
 *  length into one integer; mp1e_vlc() unpacks it.
 */
extern const unsigned long long macroblock_address_increment_vlc[33];
extern const unsigned long long coded_block_pattern_vlc[64];
extern const unsigned long long motion_code_vlc[17];
extern const unsigned long long dct_dc_size_luma_vlc[12];
extern const unsigned long long dct_dc_size_chroma_vlc[12];

/* Unpack a table element: stores the code bits, returns the bit length. */
extern int mp1e_vlc(unsigned long long, unsigned int *);
/* Look up the code for a run/level pair; returns -1 if there is none. */
extern int mp1e_dct_coeff_vlc(int table, int run, int level, unsigned int *);

#endif /* MPEG_H */

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2001 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

[...2414 lines suppressed...]
	rte_helper_reset_options(&mpeg1->codec);

	return &mpeg1->codec;
}

/*
 *  Codec class record for the MPEG-1 video encoder: public description
 *  plus the constructor, destructor and option handlers.
 */
rte_codec_class
mp1e_mpeg1_video_codec = {
	.public.stream_type	= RTE_STREAM_VIDEO,
	.public.keyword		= "mpeg1_video",
	.public.label		= "MPEG-1 Video",

	.new			= codec_new,
	.delete			= codec_delete,

	.option_enum		= option_enum,
	.option_get		= option_get,
	.option_set		= option_set,
	.option_print		= option_print,
};

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: tables.c,v 1.1 2001/12/04 23:58:09 mswitch Exp $ */

#include "mpeg.h"

/*
 *  ISO 13818-2 Table 6.4
 *  Frames per second, indexed by frame rate code. Only codes 1 to 8
 *  carry a rate; code 0 and codes 9 to 15 read as 0.
 */
const double
frame_rate_value[16] =
{
	[1] = 24000.0 / 1001,
	[2] = 24.0,
	[3] = 25.0,
	[4] = 30000.0 / 1001,
	[5] = 30.0,
	[6] = 50.0,
	[7] = 60000.0 / 1001,
	[8] = 60.0
};

/*
 *  ISO 13818-2 6.3.11
 */
const unsigned char
default_intra_quant_matrix[8][8] =
{
	{  8, 16, 19, 22, 26, 27, 29, 34 },
	{ 16, 16, 22, 24, 27, 29, 34, 37 },
	{ 19, 22, 26, 27, 29, 34, 34, 38 },
	{ 22, 22, 26, 27, 29, 34, 37, 40 },
	{ 22, 26, 27, 29, 32, 35, 40, 48 },
	{ 26, 27, 29, 32, 35, 40, 48, 58 },
	{ 26, 27, 29, 34, 38, 46, 56, 69 },
	{ 27, 29, 35, 38, 46, 56, 69, 83 }
};

/* Default quantiser matrix for non-intra blocks: every entry is 16. */
#define FLAT_ROW { 16, 16, 16, 16, 16, 16, 16, 16 }

const unsigned char
default_inter_quant_matrix[8][8] =
{
	FLAT_ROW, FLAT_ROW, FLAT_ROW, FLAT_ROW,
	FLAT_ROW, FLAT_ROW, FLAT_ROW, FLAT_ROW
};

#undef FLAT_ROW

/*
 *  ISO 13818-2 Figure 7-2, 7-3
 *  scan[pattern][row][column] is the position of that coefficient in
 *  the scanning order. Pattern 0 is the zigzag scan of Figure 7-2,
 *  pattern 1 the alternate scan of Figure 7-3.
 */
const unsigned char
scan[2][8][8] =
{
	[0] = {
		{  0,  1,  5,  6, 14, 15, 27, 28 },
		{  2,  4,  7, 13, 16, 26, 29, 42 },
		{  3,  8, 12, 17, 25, 30, 41, 43 },
		{  9, 11, 18, 24, 31, 40, 44, 53 },
		{ 10, 19, 23, 32, 39, 45, 52, 54 },
		{ 20, 22, 33, 38, 46, 51, 55, 60 },
		{ 21, 34, 37, 47, 50, 56, 59, 61 },
		{ 35, 36, 48, 49, 57, 58, 62, 63 }
	},
	[1] = {
		{  0,  4,  6, 20, 22, 36, 38, 52 },
		{  1,  5,  7, 21, 23, 37, 39, 53 },
		{  2,  8, 19, 24, 34, 40, 50, 54 },
		{  3,  9, 18, 25, 35, 41, 51, 55 },
		{ 10, 17, 26, 30, 42, 46, 56, 60 },
		{ 11, 16, 27, 31, 43, 47, 57, 61 },
		{ 12, 15, 28, 32, 44, 48, 58, 62 },
		{ 13, 14, 29, 33, 45, 49, 59, 63 }
	}
};

/*
 *  ISO 13818-2 Table 7.6
 *  quantiser_scale[type][code]: row 0 is the linear mapping (twice the
 *  code), row 1 the non-linear one. Entry 0 of both rows is 0.
 */
const unsigned char
quantiser_scale[2][32] =
{
	[0] = {
		 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
		32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
	},
	[1] = {
		 0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 14, 16, 18, 20, 22,
		24, 28, 32, 36, 40, 44, 48, 52, 56, 64, 72, 80, 88, 96, 104, 112
	}
};

/*
 *  Variable Length Codes
 *
 *  VLC(bits) takes a code written as a string of binary digits and packs
 *  it into one integer: the digits are read as an octal constant, so
 *  every code bit occupies three bits, shifted left by five; the low
 *  five bits hold the number of digits. mp1e_vlc() reverses this.
 *  At most 19 digits fit (19 * 3 + 5 = 62 bits).
 *
 *  The ULL suffix is required: without it a constant that fits in an
 *  int is shifted as an int, and codes with ten or more significant
 *  digits (e.g. 1111111111) overflow and lose their leading bits.
 */
#define VLC(bits) (((0 ## bits ## ULL) << 5) | (sizeof(# bits) - 1))

/*
 *  ISO 13818 Table B-1
 *  Variable length codes for macroblock_address_increment.
 *  Element [n] is the code for an increment of n + 1; the number
 *  after each entry is that increment.
 */
const unsigned long long
macroblock_address_increment_vlc[33] =
{
	VLC(1),			/*  1 */
	VLC(011),		/*  2 */
	VLC(010),		/*  3 */
	VLC(0011),		/*  4 */
	VLC(0010),		/*  5 */
	VLC(00011),		/*  6 */
	VLC(00010),		/*  7 */
	VLC(0000111),		/*  8 */
	VLC(0000110),		/*  9 */
	VLC(00001011),		/* 10 */
	VLC(00001010),		/* 11 */
	VLC(00001001),		/* 12 */
	VLC(00001000),		/* 13 */
	VLC(00000111),		/* 14 */
	VLC(00000110),		/* 15 */
	VLC(0000010111),	/* 16 */
	VLC(0000010110),	/* 17 */
	VLC(0000010101),	/* 18 */
	VLC(0000010100),	/* 19 */
	VLC(0000010011),	/* 20 */
	VLC(0000010010),	/* 21 */
	VLC(00000100011),	/* 22 */
	VLC(00000100010),	/* 23 */
	VLC(00000100001),	/* 24 */
	VLC(00000100000),	/* 25 */
	VLC(00000011111),	/* 26 */
	VLC(00000011110),	/* 27 */
	VLC(00000011101),	/* 28 */
	VLC(00000011100),	/* 29 */
	VLC(00000011011),	/* 30 */
	VLC(00000011010),	/* 31 */
	VLC(00000011001),	/* 32 */
	VLC(00000011000)	/* 33 */
	// VLC(00000001000) macroblock_escape code
};

/*
 *  ISO 13818-2 Table B-9
 *  Variable length codes for coded_block_pattern, indexed by the
 *  pattern value (noted after each entry).
 */
const unsigned long long
coded_block_pattern_vlc[64] =
{
	VLC(000000001),		/*  0 */ // This entry shall not be used with 4:2:0 chrominance structure
	VLC(01011),		/*  1 */
	VLC(01001),		/*  2 */
	VLC(001101),		/*  3 */
	VLC(1101),		/*  4 */
	VLC(0010111),		/*  5 */
	VLC(0010011),		/*  6 */
	VLC(00011111),		/*  7 */
	VLC(1100),		/*  8 */
	VLC(0010110),		/*  9 */
	VLC(0010010),		/* 10 */
	VLC(00011110),		/* 11 */
	VLC(10011),		/* 12 */
	VLC(00011011),		/* 13 */
	VLC(00010111),		/* 14 */
	VLC(00010011),		/* 15 */
	VLC(1011),		/* 16 */
	VLC(0010101),		/* 17 */
	VLC(0010001),		/* 18 */
	VLC(00011101),		/* 19 */
	VLC(10001),		/* 20 */
	VLC(00011001),		/* 21 */
	VLC(00010101),		/* 22 */
	VLC(00010001),		/* 23 */
	VLC(001111),		/* 24 */
	VLC(00001111),		/* 25 */
	VLC(00001101),		/* 26 */
	VLC(000000011),		/* 27 */
	VLC(01111),		/* 28 */
	VLC(00001011),		/* 29 */
	VLC(00000111),		/* 30 */
	VLC(000000111),		/* 31 */
	VLC(1010),		/* 32 */
	VLC(0010100),		/* 33 */
	VLC(0010000),		/* 34 */
	VLC(00011100),		/* 35 */
	VLC(001110),		/* 36 */
	VLC(00001110),		/* 37 */
	VLC(00001100),		/* 38 */
	VLC(000000010),		/* 39 */
	VLC(10000),		/* 40 */
	VLC(00011000),		/* 41 */
	VLC(00010100),		/* 42 */
	VLC(00010000),		/* 43 */
	VLC(01110),		/* 44 */
	VLC(00001010),		/* 45 */
	VLC(00000110),		/* 46 */
	VLC(000000110),		/* 47 */
	VLC(10010),		/* 48 */
	VLC(00011010),		/* 49 */
	VLC(00010110),		/* 50 */
	VLC(00010010),		/* 51 */
	VLC(01101),		/* 52 */
	VLC(00001001),		/* 53 */
	VLC(00000101),		/* 54 */
	VLC(000000101),		/* 55 */
	VLC(01100),		/* 56 */
	VLC(00001000),		/* 57 */
	VLC(00000100),		/* 58 */
	VLC(000000100),		/* 59 */
	VLC(111),		/* 60 */
	VLC(01010),		/* 61 */
	VLC(01000),		/* 62 */
	VLC(001100)		/* 63 */
};

/*
 *  ISO 13818 Table B-10
 *  Variable length codes for motion_code (not including sign bit),
 *  indexed by the magnitude noted after each entry.
 */
const unsigned long long
motion_code_vlc[17] =
{
	VLC(1),			/*  0 */
	VLC(01),		/*  1 */
	VLC(001),		/*  2 */
	VLC(0001),		/*  3 */
	VLC(000011),		/*  4 */
	VLC(0000101),		/*  5 */
	VLC(0000100),		/*  6 */
	VLC(0000011),		/*  7 */
	VLC(000001011),		/*  8 */
	VLC(000001010),		/*  9 */
	VLC(000001001),		/* 10 */
	VLC(0000010001),	/* 11 */
	VLC(0000010000),	/* 12 */
	VLC(0000001111),	/* 13 */
	VLC(0000001110),	/* 14 */
	VLC(0000001101),	/* 15 */
	VLC(0000001100)		/* 16 */
};

/*
 *  ISO 13818-2 Table B-12
 *  Variable length codes for dct_dc_size_luminance, indexed by the
 *  size noted after each entry.
 */
const unsigned long long
dct_dc_size_luma_vlc[12] =
{
	VLC(100),		/*  0 */
	VLC(00),		/*  1 */
	VLC(01),		/*  2 */
	VLC(101),		/*  3 */
	VLC(110),		/*  4 */
	VLC(1110),		/*  5 */
	VLC(11110),		/*  6 */
	VLC(111110),		/*  7 */
	VLC(1111110),		/*  8 */
	VLC(11111110),		/*  9 */
	VLC(111111110),		/* 10 */
	VLC(111111111)		/* 11 */
};

/*
 *  ISO 13818-2 Table B-13
 *  Variable length codes for dct_dc_size_chrominance, indexed by the
 *  size noted after each entry.
 */
const unsigned long long
dct_dc_size_chroma_vlc[12] =
{
	VLC(00),		/*  0 */
	VLC(01),		/*  1 */
	VLC(10),		/*  2 */
	VLC(110),		/*  3 */
	VLC(1110),		/*  4 */
	VLC(11110),		/*  5 */
	VLC(111110),		/*  6 */
	VLC(1111110),		/*  7 */
	VLC(11111110),		/*  8 */
	VLC(111111110),		/*  9 */
	VLC(1111111110),	/* 10 */
	VLC(1111111111)		/* 11 */
};

/*
 *  One row of a DCT coefficient code table: the packed code (see VLC())
 *  and the run/level pair it stands for.
 *
 *  run and level are explicitly signed: the tables end with a row whose
 *  run is -1 and the lookup loop stops on run < 0. With plain char that
 *  test never becomes true where char is unsigned.
 */
struct dct_coeff {
	unsigned long long	code;
	signed char		run, level;
};

/*
 *  ISO 13818-2 Table B-14
 *  DCT coefficients table zero (not including sign bit)
 *
 *  Each row is { code, run, level }. The table is searched linearly by
 *  mp1e_dct_coeff_vlc() and ends with a row whose run is -1.
 */
static const struct dct_coeff
dct_coeff_zero_vlc[] =
{
	// VLC(10) End of Block
	// { VLC(1), 0, 1 } This code shall be used
	// for the first (DC) coefficient of a non-intra block
	{ VLC(11), 0, 1 },
	{ VLC(011), 1, 1 },
	{ VLC(0100), 0, 2 },
	{ VLC(0101), 2, 1 },
	{ VLC(00101), 0, 3 },
	{ VLC(00111), 3, 1 },
	{ VLC(00110), 4, 1 },
	{ VLC(000110), 1, 2 },
	{ VLC(000111), 5, 1 },
	{ VLC(000101), 6, 1 },
	{ VLC(000100), 7, 1 },
	{ VLC(0000110), 0, 4 },
	{ VLC(0000100), 2, 2 },
	{ VLC(0000111), 8, 1 },
	{ VLC(0000101), 9, 1 },
	// VLC(000001) Escape code
	{ VLC(00100110), 0, 5 },
	{ VLC(00100001), 0, 6 },
	{ VLC(00100101), 1, 3 },
	{ VLC(00100100), 3, 2 },
	{ VLC(00100111), 10, 1 },
	{ VLC(00100011), 11, 1 },
	{ VLC(00100010), 12, 1 },
	{ VLC(00100000), 13, 1 },
	{ VLC(0000001010), 0, 7 },
	{ VLC(0000001100), 1, 4 },
	{ VLC(0000001011), 2, 3 },
	{ VLC(0000001111), 4, 2 },
	{ VLC(0000001001), 5, 2 },
	{ VLC(0000001110), 14, 1 },
	{ VLC(0000001101), 15, 1 },
	{ VLC(0000001000), 16, 1 },
	{ VLC(000000011101), 0, 8 },
	{ VLC(000000011000), 0, 9 },
	{ VLC(000000010011), 0, 10 },
	{ VLC(000000010000), 0, 11 },
	{ VLC(000000011011), 1, 5 },
	{ VLC(000000010100), 2, 4 },
	{ VLC(000000011100), 3, 3 },
	{ VLC(000000010010), 4, 3 },
	{ VLC(000000011110), 6, 2 },
	{ VLC(000000010101), 7, 2 },
	{ VLC(000000010001), 8, 2 },
	{ VLC(000000011111), 17, 1 },
	{ VLC(000000011010), 18, 1 },
	{ VLC(000000011001), 19, 1 },
	{ VLC(000000010111), 20, 1 },
	{ VLC(000000010110), 21, 1 },
	{ VLC(0000000011010), 0, 12 },
	{ VLC(0000000011001), 0, 13 },
	{ VLC(0000000011000), 0, 14 },
	{ VLC(0000000010111), 0, 15 },
	{ VLC(0000000010110), 1, 6 },
	{ VLC(0000000010101), 1, 7 },
	{ VLC(0000000010100), 2, 5 },
	{ VLC(0000000010011), 3, 4 },
	{ VLC(0000000010010), 5, 3 },
	{ VLC(0000000010001), 9, 2 },
	{ VLC(0000000010000), 10, 2 },
	{ VLC(0000000011111), 22, 1 },
	{ VLC(0000000011110), 23, 1 },
	{ VLC(0000000011101), 24, 1 },
	{ VLC(0000000011100), 25, 1 },
	{ VLC(0000000011011), 26, 1 },
	{ VLC(00000000011111), 0, 16 },
	{ VLC(00000000011110), 0, 17 },
	{ VLC(00000000011101), 0, 18 },
	{ VLC(00000000011100), 0, 19 },
	{ VLC(00000000011011), 0, 20 },
	{ VLC(00000000011010), 0, 21 },
	{ VLC(00000000011001), 0, 22 },
	{ VLC(00000000011000), 0, 23 },
	{ VLC(00000000010111), 0, 24 },
	{ VLC(00000000010110), 0, 25 },
	{ VLC(00000000010101), 0, 26 },
	{ VLC(00000000010100), 0, 27 },
	{ VLC(00000000010011), 0, 28 },
	{ VLC(00000000010010), 0, 29 },
	{ VLC(00000000010001), 0, 30 },
	{ VLC(00000000010000), 0, 31 },
	{ VLC(000000000011000), 0, 32 },
	{ VLC(000000000010111), 0, 33 },
	{ VLC(000000000010110), 0, 34 },
	{ VLC(000000000010101), 0, 35 },
	{ VLC(000000000010100), 0, 36 },
	{ VLC(000000000010011), 0, 37 },
	{ VLC(000000000010010), 0, 38 },
	{ VLC(000000000010001), 0, 39 },
	{ VLC(000000000010000), 0, 40 },
	{ VLC(000000000011111), 1, 8 },
	{ VLC(000000000011110), 1, 9 },
	{ VLC(000000000011101), 1, 10 },
	{ VLC(000000000011100), 1, 11 },
	{ VLC(000000000011011), 1, 12 },
	{ VLC(000000000011010), 1, 13 },
	{ VLC(000000000011001), 1, 14 },
	{ VLC(0000000000010011), 1, 15 },
	{ VLC(0000000000010010), 1, 16 },
	{ VLC(0000000000010001), 1, 17 },
	{ VLC(0000000000010000), 1, 18 },
	{ VLC(0000000000010100), 6, 3 },
	{ VLC(0000000000011010), 11, 2 },
	{ VLC(0000000000011001), 12, 2 },
	{ VLC(0000000000011000), 13, 2 },
	{ VLC(0000000000010111), 14, 2 },
	{ VLC(0000000000010110), 15, 2 },
	{ VLC(0000000000010101), 16, 2 },
	{ VLC(0000000000011111), 27, 1 },
	{ VLC(0000000000011110), 28, 1 },
	{ VLC(0000000000011101), 29, 1 },
	{ VLC(0000000000011100), 30, 1 },
	{ VLC(0000000000011011), 31, 1 },
	/* End marker: run < 0 stops the search. */
	{ VLC(0), -1, -1 }
};

/*
 *  ISO 13818-2 Table B-15
 *  DCT coefficients table one (not including sign bit)
 *
 *  Each row is { code, run, level }. The table is searched linearly by
 *  mp1e_dct_coeff_vlc() and ends with a row whose run is -1.
 */
static const struct dct_coeff
dct_coeff_one_vlc[] =
{
	// VLC(0110) End of Block
	{ VLC(10), 0, 1 },
	{ VLC(010), 1, 1 },
	{ VLC(110), 0, 2 },
	{ VLC(00101), 2, 1 },
	{ VLC(0111), 0, 3 },
	{ VLC(00111), 3, 1 },
	{ VLC(000110), 4, 1 },
	{ VLC(00110), 1, 2 },
	{ VLC(000111), 5, 1 },
	{ VLC(0000110), 6, 1 },
	{ VLC(0000100), 7, 1 },
	{ VLC(11100), 0, 4 },
	{ VLC(0000111), 2, 2 },
	{ VLC(0000101), 8, 1 },
	{ VLC(1111000), 9, 1 },
	// VLC(000001) Escape code
	{ VLC(11101), 0, 5 },
	{ VLC(000101), 0, 6 },
	{ VLC(1111001), 1, 3 },
	{ VLC(00100110), 3, 2 },
	{ VLC(1111010), 10, 1 },
	{ VLC(00100001), 11, 1 },
	{ VLC(00100101), 12, 1 },
	{ VLC(00100100), 13, 1 },
	{ VLC(000100), 0, 7 },
	{ VLC(00100111), 1, 4 },
	{ VLC(11111100), 2, 3 },
	{ VLC(11111101), 4, 2 },
	{ VLC(000000100), 5, 2 },
	{ VLC(000000101), 14, 1 },
	{ VLC(000000111), 15, 1 },
	{ VLC(0000001101), 16, 1 },
	{ VLC(1111011), 0, 8 },
	{ VLC(1111100), 0, 9 },
	{ VLC(00100011), 0, 10 },
	{ VLC(00100010), 0, 11 },
	{ VLC(00100000), 1, 5 },
	{ VLC(0000001100), 2, 4 },
	{ VLC(000000011100), 3, 3 },
	{ VLC(000000010010), 4, 3 },
	{ VLC(000000011110), 6, 2 },
	{ VLC(000000010101), 7, 2 },
	{ VLC(000000010001), 8, 2 },
	{ VLC(000000011111), 17, 1 },
	{ VLC(000000011010), 18, 1 },
	{ VLC(000000011001), 19, 1 },
	{ VLC(000000010111), 20, 1 },
	{ VLC(000000010110), 21, 1 },
	{ VLC(11111010), 0, 12 },
	{ VLC(11111011), 0, 13 },
	{ VLC(11111110), 0, 14 },
	{ VLC(11111111), 0, 15 },
	{ VLC(0000000010110), 1, 6 },
	{ VLC(0000000010101), 1, 7 },
	{ VLC(0000000010100), 2, 5 },
	{ VLC(0000000010011), 3, 4 },
	{ VLC(0000000010010), 5, 3 },
	{ VLC(0000000010001), 9, 2 },
	{ VLC(0000000010000), 10, 2 },
	{ VLC(0000000011111), 22, 1 },
	{ VLC(0000000011110), 23, 1 },
	{ VLC(0000000011101), 24, 1 },
	{ VLC(0000000011100), 25, 1 },
	{ VLC(0000000011011), 26, 1 },
	{ VLC(00000000011111), 0, 16 },
	{ VLC(00000000011110), 0, 17 },
	{ VLC(00000000011101), 0, 18 },
	{ VLC(00000000011100), 0, 19 },
	{ VLC(00000000011011), 0, 20 },
	{ VLC(00000000011010), 0, 21 },
	{ VLC(00000000011001), 0, 22 },
	{ VLC(00000000011000), 0, 23 },
	{ VLC(00000000010111), 0, 24 },
	{ VLC(00000000010110), 0, 25 },
	{ VLC(00000000010101), 0, 26 },
	{ VLC(00000000010100), 0, 27 },
	{ VLC(00000000010011), 0, 28 },
	{ VLC(00000000010010), 0, 29 },
	{ VLC(00000000010001), 0, 30 },
	{ VLC(00000000010000), 0, 31 },
	{ VLC(000000000011000), 0, 32 },
	{ VLC(000000000010111), 0, 33 },
	{ VLC(000000000010110), 0, 34 },
	{ VLC(000000000010101), 0, 35 },
	{ VLC(000000000010100), 0, 36 },
	{ VLC(000000000010011), 0, 37 },
	{ VLC(000000000010010), 0, 38 },
	{ VLC(000000000010001), 0, 39 },
	{ VLC(000000000010000), 0, 40 },
	{ VLC(000000000011111), 1, 8 },
	{ VLC(000000000011110), 1, 9 },
	{ VLC(000000000011101), 1, 10 },
	{ VLC(000000000011100), 1, 11 },
	{ VLC(000000000011011), 1, 12 },
	{ VLC(000000000011010), 1, 13 },
	{ VLC(000000000011001), 1, 14 },
	{ VLC(0000000000010011), 1, 15 },
	{ VLC(0000000000010010), 1, 16 },
	{ VLC(0000000000010001), 1, 17 },
	{ VLC(0000000000010000), 1, 18 },
	{ VLC(0000000000010100), 6, 3 },
	{ VLC(0000000000011010), 11, 2 },
	{ VLC(0000000000011001), 12, 2 },
	{ VLC(0000000000011000), 13, 2 },
	{ VLC(0000000000010111), 14, 2 },
	{ VLC(0000000000010110), 15, 2 },
	{ VLC(0000000000010101), 16, 2 },
	{ VLC(0000000000011111), 27, 1 },
	{ VLC(0000000000011110), 28, 1 },
	{ VLC(0000000000011101), 29, 1 },
	{ VLC(0000000000011100), 30, 1 },
	{ VLC(0000000000011011), 31, 1 },
	/* End marker: run < 0 stops the search. */
	{ VLC(0), -1, -1 }
};

/*
 *  Unpack a VLC() table element.
 *
 *  The low five bits of vlc_octet hold the code length; above them
 *  every code bit sits in the lowest bit of its own group of three.
 *  Up to 19 groups are collected into *code, least significant first.
 *  Returns the bit length.
 */
int
mp1e_vlc(unsigned long long vlc_octet, unsigned int *code)
{
	unsigned long long digits = vlc_octet >> 5;
	unsigned int bits = 0;
	int pos;

	for (pos = 0; pos < 19; pos++, digits >>= 3) {
		if (digits & 1)
			bits |= 1u << pos;
	}

	*code = bits;

	return vlc_octet & 0x1F;
}

/*
 *  Look up the code for a run/level pair, not including the sign bit
 *  (append 0 for positive level, 1 for negative level).
 *
 *  table selects dct_coeff_one_vlc when non-zero, dct_coeff_zero_vlc
 *  otherwise. On a match the code is stored through vlcp and its bit
 *  length returned; -1 means the pair has no code in that table.
 */
int
mp1e_dct_coeff_vlc(int table, int run, int level, unsigned int *vlcp)
{
	const struct dct_coeff *entry;

	if (table)
		entry = dct_coeff_one_vlc;
	else
		entry = dct_coeff_zero_vlc;

	/* Both tables end with a row whose run is negative. */
	while (entry->run >= 0) {
		if (entry->run == run && entry->level == level)
			return mp1e_vlc(entry->code, vlcp);
		entry++;
	}

	return -1; // No vlc for this run/level combination
}

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: video.h,v 1.1 2001/12/04 23:58:10 mswitch Exp $ */

#ifndef VIDEO_H
#define VIDEO_H

#include <stdint.h>

#include "../common/bstream.h"
#include "../common/fifo.h"
#include "../common/log.h"
#include "../common/math.h"
#include "../common/sync.h"
#include "mblock.h"

#include "libvideo.h"
#include "mpeg.h"

#define MAX_WIDTH 1024			/* 1 ... 4096 */
#define MAX_HEIGHT 1024			/* 1 ... 2800 */

#define reg(n) __attribute__ ((regparm (n)))
/* Number of elements of a true array (not a pointer); argument parenthesized. */
#define elements(array) (sizeof(array) / sizeof((array)[0]))

/* Rate control state; member order is unchanged. */
struct rc {
	/* picture types per GOP */
	int		ni;
	int		np;
	int		nb;
	int		ob;

	long long	Ei;
	long long	Ep;
	long long	Eb;
	long long	gop_count;

	double		ei;
	double		ep;
	double		eb;

	/* estimated target bits per GOP */
	int		G0;
	int		Gn;
	double		G4;

	int		Tavg;		/* estimated avg. bits per frame */
	int		Tmin;		/* minimum target bits per frame */
	int		R;		/* remaining bits in GOP */

	/* global complexity measure */
	double		Xi;
	double		Xp;
	double		Xb;

	/* virtual buffer fullness */
	double		d0i;
	double		d0p;
	double		d0b;

	double		r31;		/* reaction parameter */

	/* avg spatial activity, intra/inter */
	double		avg_acti;
	double		avg_actp;

	/* auto */

	/* sum spatial activity, intra/inter */
	double		act_sumi;
	double		act_sump;

	double		Ti;
	double		Tmb;
	int		T;
};

/*
 *  Max. successive P pictures when overriding gop_sequence
 *  (error accumulation) and max. successive B pictures we can stack up
 *  (MAX_B_SUCC is also the size of mpeg1_context.stack[]).
 */
#define MAX_P_SUCC 3
#define MAX_B_SUCC 31

/*
 *  Weight of B pictures in the bit allocation of rc_picture_start():
 *  the B picture complexity term is divided by this factor for I and P
 *  targets, and the I and P terms are multiplied by it for B targets.
 */
#define B_SHARE 1.4

/*
 *  Begin rate control for one picture.
 *
 *  Computes the target size rc->T of a picture of the given type from
 *  the bits remaining in the GOP (rc->R), the per type picture counts
 *  and the complexity measures Xi, Xp, Xb. The target is raised to
 *  rc->Tmin if it falls below. rc->Ti is set to the negated virtual
 *  buffer fullness of the picture type, rc->Tmb to the per macroblock
 *  share, and both activity sums are cleared.
 *
 *  mb_num is the number of macroblocks in the picture.
 */
static inline void
rc_picture_start(struct rc *rc, picture_type type, int mb_num)
{
	switch (type) {
	case I_TYPE:
		/*
		 *  T = lroundn(R / (+ (ni) * Xi / (Xi * 1.0)
		 *		     + (np) * Xp / (Xi * 1.0)
		 *		     + (nb) * Xb / (Xi * 1.4)));
		 */
		rc->T = lroundn(rc->R / ((rc->ni + rc->ei)
					 + ((rc->np + rc->ep) * rc->Xp
					    + (rc->nb + rc->eb) * rc->Xb / B_SHARE)
					 / rc->Xi));
		rc->Ti = -rc->d0i;
		break;

	case P_TYPE:
		/* Same form as the I case with the roles of I and P swapped. */
		rc->T = lroundn(rc->R / ((rc->np + rc->ep)
					 + ((rc->ni + rc->ei) * rc->Xi
					    + (rc->nb + rc->eb) * rc->Xb / B_SHARE)
					 / rc->Xp));
		rc->Ti = -rc->d0p;
		break;

	case B_TYPE:
		/*
		 *  T = lroundn(R / (+ (ni + ei) * Xi * 1.4 / Xb
		 *		     + (np + ep) * Xp * 1.4 / Xb
		 *		     + (nb + eb) * Xb / Xb));
		 */
		rc->T = lroundn(rc->R / (((rc->ni + rc->ei) * rc->Xi
					  + (rc->np + rc->ep) * rc->Xp) * B_SHARE
					 / rc->Xb + (rc->nb + rc->eb)));
		rc->Ti = -rc->d0b;
		break;

	default:
		FAIL("!reached");
	}

	if (rc->T < rc->Tmin)
		rc->T = rc->Tmin;

	/*
	 *  NOTE(review): T and mb_num are both int, so this is an integer
	 *  division; the fraction is dropped before the result is stored
	 *  in the double Tmb. Confirm that this is intended.
	 */
	rc->Tmb = rc->T / mb_num;

	rc->act_sumi = 0.0;
	rc->act_sump = 0.0;
}

/*
 *  Choose the quantiser for the next macroblock.
 *
 *  The activity of the macroblock (acti for intra, actp otherwise) is
 *  added to the running sums and normalised against the stored average:
 *  (2 * act + avg) / (act + 2 * avg). The quantiser is the difference
 *  between the bits produced so far (bits_out) and the running target
 *  rc->Ti, scaled by rc->r31 and the normalised activity. rc->Ti then
 *  advances by rc->Tmb.
 *
 *  For MB_INTRA, MB_FORWARD and MB_BACKWARD the result is shifted right
 *  by qs and limited to 1 ... quant_max. For MB_INTERP it is returned
 *  unshifted and unlimited (the limiting is commented out).
 *
 *  NOTE(review): if FAIL() returns, quant is returned uninitialised in
 *  the default case -- presumably FAIL() does not return; confirm.
 */
static inline int
rc_quant(struct rc *rc, mb_type type,
	 double acti, double actp,
	 int bits_out, int qs, int quant_max)
{
	int quant;

	switch (type) {
	case MB_INTRA:
		rc->act_sumi += acti;
		acti = (2.0 * acti + rc->avg_acti) / (acti + 2.0 * rc->avg_acti);
		quant = lroundn((bits_out - rc->Ti) * rc->r31 * acti);
		quant = saturate(quant >> qs, 1, quant_max);
		rc->Ti += rc->Tmb;
		break;

	case MB_FORWARD:
	case MB_BACKWARD:
		/* Predicted macroblocks feed both activity sums. */
		rc->act_sumi += acti;
		rc->act_sump += actp;
		actp = (2.0 * actp + rc->avg_actp) / (actp + 2.0 * rc->avg_actp);
		quant = lroundn((bits_out - rc->Ti) * rc->r31 * actp);
		quant = saturate(quant >> qs, 1, quant_max);
		rc->Ti += rc->Tmb;
		break;

	case MB_INTERP:
		rc->act_sumi += acti;
		rc->act_sump += actp;
		actp = (2.0 * actp + rc->avg_actp) / (actp + 2.0 * rc->avg_actp);
		quant = lroundn((bits_out - rc->Ti) * rc->r31 * actp);
		/* quant = saturate(quant, 1, quant_max); */
		rc->Ti += rc->Tmb;
		break;

	default:
		FAIL("!reached");
	}

	return quant;
}

/*
 *  Finish rate control for one picture.
 *
 *  S is the number of bits the picture took, quant_sum the sum of the
 *  quantisers used and mb_num the number of macroblocks. Updates the
 *  average activities from the sums collected by rc_quant() (I pictures
 *  update the intra average only), sets the complexity measure of the
 *  picture type to S times the mean quantiser, and adds the deviation
 *  from the target rc->T to the virtual buffer fullness of that type.
 */
static inline void
rc_picture_end(struct rc *rc, picture_type type,
	       int S, int quant_sum, int mb_num)
{
	switch (type) {
	case I_TYPE:
		rc->avg_acti = rc->act_sumi / mb_num;
		rc->Xi = lroundn(S * (double) quant_sum / mb_num);
		rc->d0i += S - rc->T; /* bits encoded - estimated bits */
		break;

	case P_TYPE:
		rc->avg_acti = rc->act_sumi / mb_num;
		rc->avg_actp = rc->act_sump / mb_num;
		rc->Xp = lroundn(S * (double) quant_sum / mb_num);
		rc->d0p += S - rc->T;
		break;

	case B_TYPE:
		rc->avg_acti = rc->act_sumi / mb_num;
		rc->avg_actp = rc->act_sump / mb_num;
		rc->Xb = lroundn(S * (double) quant_sum / mb_num);
		rc->d0b += S - rc->T;
		break;

	default:
		FAIL("!reached");
	}
}

/*
 *  One entry of the frame stack kept in mpeg1_context (stack[] and last).
 */
typedef struct stacked_frame stacked_frame;

struct stacked_frame {
	uint8_t *	org;
	buffer *	buffer;
	double		time;
	int		skipped;
};

typedef struct mpeg1_context mpeg1_context;

/*
 *  State of one MPEG-1 video encoder instance: picture coding function
 *  pointers, the stack of frames waiting to be coded, timing, rate
 *  control, input/output plumbing and the user options.
 */
struct mpeg1_context {
	uint8_t		seq_header_template[32];

	uint8_t *	zerop_template;		/* empty P picture */
	int		Sz;			/* .. size in bytes */

	/* Picture coding routines, one per picture type. */
	int		(* picture_i)(mpeg1_context *, uint8_t *org);
	int		(* picture_p)(mpeg1_context *, uint8_t *org,
				      int dist, int forward_motion);
	int		(* picture_b)(mpeg1_context *, uint8_t *org,
				      int dist, int forward_motion,
				      int backward_motion);

	/* Prediction routines; same signatures as those in motion.h. */
	unsigned int	(* predict_forward)(uint8_t *from) reg(1);
	unsigned int	(* predict_bidirectional)(uint8_t *from1, uint8_t *from2,
						  unsigned int *vmc1,
						  unsigned int *vmc2);

	stacked_frame	stack[MAX_B_SUCC];
	stacked_frame	last;

						/* frames encoded (coding order) */
	int		gop_frame_count;	/* .. in current GOP (display order) */
	int		seq_frame_count;	/* .. since last sequence header */

	double		skip_rate_acc;
	double		drop_timeout;
	double		time_per_frame;
	double		frames_per_sec;

	uint8_t *	oldref;			/* past reference frame buffer */

	bool		insert_gop_header;
	bool		closed_gop;		/* random access point, no fwd ref */
	bool		referenced;		/* by other P or B pictures */

	struct rc	rc;

	int		p_succ;
	int		skipped_fake;
	int		skipped_zero;

	uint8_t *	banner;

	consumer	cons;

	int		mb_cx_row;
	int		mb_cx_thresh;

	int		motion_min;
	int		motion_max;

	int		coded_width;
	int		coded_height;

	int		frames_per_seqhdr;

	/* Input */

	synchr_stream	sstr;
	double		coded_elapsed;

	/* Output */

	fifo *		fifo;
	producer	prod;
	double		coded_time_elapsed;
	double		coded_frame_rate;
	double		coded_frame_period;

	/* Options */

	rte_codec	codec;

	int		bit_rate;
	int		frame_rate_code;
	double		virtual_frame_rate;
	char *		gop_sequence;
	int		skip_method;
	bool		motion_compensation;
	bool		monochrome;
	char *		anno;
};

/* Encoder context instance defined elsewhere. */
extern mpeg1_context vseg;

extern uint8_t * newref;	/* future reference frame buffer */

/*
 *  Macroblock position and picture size variables.
 *  NOTE(review): presumably all counted in macroblocks -- confirm.
 */
extern int		mb_col, mb_row,
			mb_width, mb_height,
			mb_last_col, mb_last_row,
			mb_num;

/* Seven macroblock buffers, each holding six 8x8 blocks of shorts. */
extern short		mblock[7][6][8][8];

/*
 *  Addressing of the current macroblock in a frame buffer: an offset
 *  and a pitch for each of the six blocks, the luminance and
 *  chrominance steps per macroblock column and row, and the starting
 *  chrominance offset.
 */
extern struct mb_addr {
	struct {
		int		offset;
		int		pitch;
	}		block[6];
	struct {
		int		lum;
		int		chrom;
	}		col, row;
	int		chrom_0;
} mb_address;

/*
 *  The three macros below only touch the offsets of block 0 and
 *  block 4; all other fields of mb_address are left as they are.
 */

/* Rewind to the first macroblock. */
#define reset_mba()							\
do {									\
	mb_address.block[0].offset = 0;					\
	mb_address.block[4].offset = mb_address.chrom_0;		\
} while (0)

/* Advance by the column steps. */
#define mba_col_incr()							\
do {									\
	mb_address.block[0].offset += mb_address.col.lum;		\
	mb_address.block[4].offset += mb_address.col.chrom;		\
} while (0)

/* Advance by the row steps. */
#define mba_row_incr()							\
do {									\
	mb_address.block[0].offset += mb_address.row.lum;		\
	mb_address.block[4].offset += mb_address.row.chrom;		\
} while (0)

/* Alignment attribute for objects declared in this header. */
#define video_align(n) __attribute__ ((aligned (n)))

extern struct bs_rec	video_out video_align(32);

extern int		dropped;
/* Currently selected input filter and the table of filter names. */
extern int		(* filter)(unsigned char *, unsigned char *);
extern const char *	filter_labels[];

extern long long	video_frame_count;
extern long long	video_frames_dropped;

// extern fifo *		video_fifo;
// extern pthread_t	video_thread_id;

/*
 *  NOTE(review): the void * (void *) signature suggests a thread entry
 *  point -- confirm at the call site.
 */
extern void *		mpeg1_video_ipb(void *capture_fifo);

extern void		conv_init(int);
/* Input initialisers; each returns a fifo and takes a frame rate pointer. */
extern fifo *		v4l_init(double *frame_rate);
extern fifo *		v4l2_init(double *frame_rate);
extern fifo *		file_init(double *frame_rate);
extern void		filter_init(int pitch);
extern void		video_coding_size(int width, int height);
extern int		video_look_ahead(char *gop_sequence);

/* don't change order */
/* XXX rethink */
/*
 *  Capture / colour mode identifiers.  Values are implicit (CM_INVALID
 *  is 0), so CM_NUM_MODES, being last, equals the number of enumerators
 *  before it (CM_INVALID included).
 */
enum {
	CM_INVALID,
	CM_YUV,
	CM_YUYV,
	CM_YUYV_VERTICAL_DECIMATION,
	CM_YUYV_TEMPORAL_INTERPOLATION,
	CM_YUYV_VERTICAL_INTERPOLATION,
	CM_YUYV_PROGRESSIVE,
	CM_YUYV_PROGRESSIVE_TEMPORAL,
	CM_YUYV_EXP,
	CM_YUYV_EXP_VERTICAL_DECIMATION,
	CM_YUYV_EXP2,
	CM_YVU,
	CM_NUM_MODES	/* count, not a mode; keep last */
};

#endif /* VIDEO_H */

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: vlc.c,v 1.1 2001/12/04 23:58:10 mswitch Exp $ */

#include <assert.h>
#include <limits.h>
#include "../common/bstream.h"
#include "../common/log.h"
#include "mpeg.h"
#include "vlc.h"

/*
 *  Attribute applied to the VLC tables in this file: SECTION("vlc_tables")
 *  followed by an n-byte alignment request.
 *  NOTE(review): SECTION is not defined in this file; presumably a macro
 *  from one of the included headers -- confirm its expansion, since the
 *  two attributes are not comma separated here.
 */
#define align(n) __attribute__ ((SECTION("vlc_tables") aligned (n)))

// XXX
/* DC predictors; row 0 is cleared by reset_dct_pred() in vlc.h. */
int			dc_dct_pred[2][3];

/*
 *  Tables
 */

/* Filled in at run time by mp1e_vlc_init(). */
VLC2	mp1e_coded_block_pattern[64]		align(CACHE_LINE);
VLC2	mp1e_macroblock_address_increment[33]	align(CACHE_LINE);
VLCM	mp1e_motion_vector_component[480]	align(CACHE_LINE);

/*
 *  ISO/IEC 13818-2 Table B-2
 *  Variable length codes for macroblock_type in I-pictures
 *
 *  '1' Intra
 *  '01 xxxxx' Intra, Quant
 */

/*
 *  ISO/IEC 13818-2 Table B-3
 *  Variable length codes for macroblock_type in P-pictures
 *
 *  '1' MC, Coded
 *  '01' No MC, Coded
 *  '001' MC, Not Coded
 *  '0001 1' Intra
 *  '0001 0' MC, Coded, Quant
 *  '0000 1' No MC, Coded, Quant
 *  '0000 01' Intra, Quant
 */

/*
 *  ISO/IEC 13818-2 Table B-4
 *  Variable length codes for macroblock_type in B-pictures
 */
/*
 *  B-picture macroblock_type with quantiser scale code and zero motion
 *  vectors appended.  Fields are { code, length, mv_length }; mv_length
 *  is the number of trailing motion vector bits ('11' per vector).
 *  'xxxxx' in the bit strings is a 5 bit field left zero in the code.
 *  Index: 0 Intra, 1 Forward, 2 Backward, 3 Interpolated.
 */
VLC4
mp1e_macroblock_type_b_nomc_quant[4] align(16) =
{
	{ 0x0020, 11, 0 }, /* '0000 01 xxxxx' (Intra, Quant) */
	{ 0x0183, 13, 2 }, /* '0000 11 xxxxx 11' (Fwd, Coded, Quant, MV (0, 0)) */
	{ 0x0103, 13, 2 }, /* '0000 10 xxxxx 11' (Bwd, Coded, Quant, MV (0, 0)) */
	{ 0x040F, 14, 4 }  /* '0001 0 xxxxx 11 11' (Interp, Coded, Quant, */
			   /*                       FMV (0, 0), BMV (0, 0)) */
};

/*
 *  B-picture macroblock_type (coded, no quantiser change) with zero
 *  motion vectors appended.  Fields are { code, length }.
 *  Index: 0 Intra, 1 Forward, 2 Backward, 3 Interpolated.
 */
VLC2
mp1e_macroblock_type_b_nomc[4] align(8) =
{
	{ 0x03, 5 }, /* '0001 1' (Intra) */
	{ 0x0F, 6 }, /* '0011  11' (Fwd, Coded, MV (0, 0)) */
	{ 0x0F, 5 }, /* '011  11' (Bwd, Coded, MV (0, 0)) */
	{ 0x3F, 6 }, /* '11  11 11' (Interp, Coded, FMV (0, 0), BMV (0, 0)) */
};

/*
 *  B-picture macroblock_type (not coded) with zero motion vectors
 *  appended.  Fields are { code, length }; entry 0 is a placeholder.
 *  Index: 0 Intra, 1 Forward, 2 Backward, 3 Interpolated.
 */
VLC2
mp1e_macroblock_type_b_nomc_notc[4] align(8) =
{
	{ 0, 0 },    /* Intra always coded */
	{ 0x0B, 6 }, /* '0010  11' (Fwd, Not Coded, MV (0, 0)) */
	{ 0x0B, 5 }, /* '010  11' (Bwd, Not Coded, MV (0, 0)) */
	{ 0x2F, 6 }, /* '10  11 11' (Interp, Not Coded, FMV (0, 0), BMV (0, 0)) */
};

/*
 *  B-picture macroblock_type followed by a 5 bit quantiser field
 *  ('xxxxx', left zero in the code).  Fields are { code, length }.
 *  Index: 0 Intra, 1 Forward, 2 Backward, 3 Interpolated.
 */
VLC2
mp1e_macroblock_type_b_quant[4] align(8) =
{
	{ 0x020, 11 }, /* '0000 01 xxxxx' (Intra, Quant) */
	{ 0x060, 11 }, /* '0000 11 xxxxx' (Fwd, Coded, Quant) */
	{ 0x040, 11 }, /* '0000 10 xxxxx' (Bwd, Coded, Quant) */
	{ 0x040, 10 }  /* '0001 0 xxxxx' (Interp, Coded, Quant) */
};

#if 0

/* Systematic VLCs */

VLC2
mp1e_macroblock_type_b[4] align(8) =
{
	{ 0x03, 5 }, /* '0001 1' (Intra) */
	{ 0x03, 4 }, /* '0011' (Fwd, Coded) */
	{ 0x03, 3 }, /* '011' (Bwd, Coded) */
	{ 0x03, 2 }, /* '11' (Interp, Coded) */
};

VLC2
mp1e_macroblock_type_b_notc[4] align(8) =
{
	{ 0, 0 },    /* Intra always coded */
	{ 0x02, 4 }, /* '0010' (Fwd, Not Coded) */
	{ 0x02, 3 }, /* '010' (Bwd, Not Coded) */
	{ 0x02, 2 }, /* '10' (Interp, Not Coded) */
};

#endif

/* Scan order lookup, filled by mp1e_vlc_init() from scan[0][0][]. */
unsigned char	mp1e_iscan[8][8]		align(CACHE_LINE);

/*
 *  Run-time tables filled by mp1e_vlc_init():
 *  - mp1e_dc_vlc_intra: 5 variants x 12 dct_dc_size values,
 *  - mp1e_ac_vlc_zero / mp1e_ac_vlc_one: AC coefficient codes built from
 *    tables B-14 and B-15 respectively.
 */
VLC8		mp1e_dc_vlc_intra[5][12]	align(CACHE_LINE);
VLC2		mp1e_ac_vlc_zero[176]		align(CACHE_LINE);
VLC2		mp1e_ac_vlc_one[176]		align(CACHE_LINE);

// XXX
/* Defined elsewhere; redeclared here for the routines in this file. */
extern short		mblock[7][6][8][8];
extern struct bs_rec	video_out;
extern const char 	cbp_order[6];

/*
 *  Build the run-time VLC lookup tables used by the block encoders:
 *
 *  - mp1e_macroblock_address_increment[33]
 *  - mp1e_coded_block_pattern[64] (indexed by the encoder's own cbp bit
 *    order, translated through cbp_order[])
 *  - mp1e_motion_vector_component[] for f_code F_CODE_MIN..F_CODE_MAX
 *  - mp1e_dc_vlc_intra[5][12] (plain, and prefixed with the EOB of the
 *    previous block for tables B-14 and B-15)
 *  - mp1e_ac_vlc_zero[] (B-14) and mp1e_ac_vlc_one[] (B-15)
 *  - mp1e_iscan[][]
 *
 *  AC table layout: for each run 0..63 a header entry { code = run,
 *  length = number of entries in this group including the header },
 *  followed by one entry per level 1, 2, ... that has a VLC.
 *
 *  Takes no arguments and returns nothing; must run before the tables
 *  are used.
 */
void
mp1e_vlc_init(void)
{
	int i, j;
	unsigned int code;
	int dct_dc_size;
	int run, level, length;
	int f_code;

	/* Variable length codes for macroblock address increment */

	for (i = 0; i < 33; i++) {
		mp1e_macroblock_address_increment[i].length =
			mp1e_vlc(macroblock_address_increment_vlc[i], &code);
		mp1e_macroblock_address_increment[i].code = code;
		assert(code <= UCHAR_MAX);
	}

	/* Variable length codes for coded block pattern */

	for (i = 0; i < 64; i++) {
		/* 'pattern' no longer shadows the function-level j */
		int k, pattern = 0;

		for (k = 0; k < 6; k++)
			if (i & (1 << k))
				pattern |= 0x20 >> cbp_order[k]; /* (5 - k) */

		mp1e_coded_block_pattern[i].length =
			mp1e_vlc(coded_block_pattern_vlc[pattern], &code);

		mp1e_coded_block_pattern[i].code = code;
		assert(code <= UCHAR_MAX);
	}

	/* Variable length codes for motion vector component */

	for (f_code = F_CODE_MIN; f_code <= F_CODE_MAX; f_code++) {
		int r_size = f_code - 1;
		int f1 = (1 << r_size) - 1;

		for (i = 0; i < 16 << f_code; i++) {
			int motion_code, motion_residual;
			/* upper half of the index range wraps to negative deltas */
			int delta = (i < (16 << r_size)) ? i : i - (16 << f_code);

			motion_code = (abs(delta) + f1) >> r_size;
			motion_residual = (abs(delta) + f1) & f1;

			length = mp1e_vlc(motion_code_vlc[motion_code], &code);

			if (motion_code != 0) {
				code = code * 2 + (delta < 0); /* sign */
				length++;
			}

			if (f_code > 1 && motion_code != 0) {
				code = (code << r_size) + motion_residual;
				length += r_size;
			}

			/* must fit the VLCM bit-fields (12 + 4 bits) */
			assert(code < (1 << 12) && length < 16);

			mp1e_motion_vector_component[f1 * 32 + i].code = code;
			mp1e_motion_vector_component[f1 * 32 + i].length = length;
#if 0
			fprintf(stderr, "MV %02x %-2d ", i, delta);

			for (j = length - 1; j >= 0; j--)
				fprintf(stderr, "%d", (code & (1 << j)) > 0);

			fprintf(stderr, "\n");
#endif
		}
	}

	/* Variable length codes for intra DC coefficient */

	for (dct_dc_size = 0; dct_dc_size < 12; dct_dc_size++) {
		unsigned int luma_code, chroma_code;
		int luma_len, chroma_len;

		/*
		 *  Look each size code up once.  The previous form read
		 *  'code' and called mp1e_vlc(..., &code) within the same
		 *  expression, which only worked because an earlier call
		 *  had already stored the same value.
		 */
		luma_len = mp1e_vlc(dct_dc_size_luma_vlc[dct_dc_size],
				    &luma_code);
		chroma_len = mp1e_vlc(dct_dc_size_chroma_vlc[dct_dc_size],
				      &chroma_code);

		/* Intra DC luma VLC */
		mp1e_dc_vlc_intra[0][dct_dc_size].length =
			luma_len + dct_dc_size;
		mp1e_dc_vlc_intra[0][dct_dc_size].code =
			luma_code << dct_dc_size;

		/* Intra DC luma VLC with EOB ('10' table B-14) of prev. block */
		mp1e_dc_vlc_intra[1][dct_dc_size].length =
			luma_len + dct_dc_size + 2;
		mp1e_dc_vlc_intra[1][dct_dc_size].code =
			((0x2 << luma_len) | luma_code) << dct_dc_size;

		/* Intra DC chroma VLC with EOB of previous block */
		mp1e_dc_vlc_intra[2][dct_dc_size].length =
			chroma_len + dct_dc_size + 2;
		mp1e_dc_vlc_intra[2][dct_dc_size].code =
			((0x2 << chroma_len) | chroma_code) << dct_dc_size;

		/* Intra DC luma VLC with EOB ('0110' table B-15) of prev. block */
		mp1e_dc_vlc_intra[3][dct_dc_size].length =
			luma_len + dct_dc_size + 4;
		mp1e_dc_vlc_intra[3][dct_dc_size].code =
			((0x6 << luma_len) | luma_code) << dct_dc_size;

		/* Intra DC chroma VLC with EOB of previous block */
		mp1e_dc_vlc_intra[4][dct_dc_size].length =
			chroma_len + dct_dc_size + 4;
		mp1e_dc_vlc_intra[4][dct_dc_size].code =
			((0x6 << chroma_len) | chroma_code) << dct_dc_size;
	}

	/* Variable length codes for AC coefficients (table B-14) */

	for (i = run = 0; run < 64; run++) {
		/* entry i is written next, so i must be a valid index */
		assert(i < elements(mp1e_ac_vlc_zero));

		mp1e_ac_vlc_zero[j = i++].code = run;

		for (level = 1;
		     (length = mp1e_dct_coeff_vlc(0, run, level, &code)) > 0;
		     level++, i++) {
			assert(i < elements(mp1e_ac_vlc_zero));
			assert((code << 1) <= UCHAR_MAX);

			mp1e_ac_vlc_zero[i].length = length + 1;
			mp1e_ac_vlc_zero[i].code = code << 1; /* sign 0 */
		}

		mp1e_ac_vlc_zero[j].length = i - j;
	}

	/* Variable length codes for AC coefficients (table B-15) */

	for (i = run = 0; run < 64; run++) {
		assert(i < elements(mp1e_ac_vlc_one));

		mp1e_ac_vlc_one[j = i++].code = run;

		for (level = 1;
		     (length = mp1e_dct_coeff_vlc(1, run, level, &code)) > 0;
		     level++, i++) {
			assert(i < elements(mp1e_ac_vlc_one));
			assert((code << 0) <= UCHAR_MAX);

			mp1e_ac_vlc_one[i].length = length + 1;
			mp1e_ac_vlc_one[i].code = code << 0;
			/* no sign (would need 9 bits) */
		}

		/*
		 *  Group size belongs in the B-15 table's own header; this
		 *  used to be stored into mp1e_ac_vlc_zero[j], clobbering
		 *  the B-14 headers and leaving these unset.
		 */
		mp1e_ac_vlc_one[j].length = i - j;
	}

	/*
	 *  Forward zig-zag scanning pattern
	 */
	for (i = 0; i < 64; i++) {
		/* iscan[0][63 - scan[0][0][i]] = (i & 7) * 8 + (i >> 3); */
		mp1e_iscan[0][(scan[0][0][i] - 1) & 63] =
			(i & 7) * 8 + (i >> 3);
	}
}

/* Reference */

#if 0

/*
 *  Disabled C reference for the intra macroblock encoder.
 *
 *  Writes the six blocks of mblock[1] to video_out in the order
 *  0, 2, 1, 3, 4, 5 followed by the final EOB.  Returns 0 on success,
 *  non-zero when a coefficient magnitude exceeds 255; in that case the
 *  DC predictors in dc_dct_pred[0] are restored from the copy taken in
 *  dc_dct_pred[1].
 *
 *  NOTE(review): this code still refers to ac_vlc_zero, iscan and
 *  dc_vlc_intra, while the tables defined in this file carry an mp1e_
 *  prefix; it uses a GCC nested function.
 */
int
mp1e_mpeg1_encode_intra(void)
{
	int v;

	/*
	 *  Encode one 8x8 block: DC difference against *dc_pred using
	 *  dc_vlc, then the AC coefficients.  Returns 1 on overflow.
	 */
	int
	encode_block(short block[8][8], int *dc_pred, VLC8 *dc_vlc)
	{
		/* DC coefficient */

		{
			register int val = block[0][0] - *dc_pred, size;

			/*
			 *  Find first set bit, starting at msb with 0 -> 0.
			 */
			asm volatile (
				" bsrl		%1,%0\n"
				" jnz		1f\n"
				" movl		$-1,%0\n"
		    	        "1:\n"
				" incl		%0\n"
			: "=&r" (size) : "r" (abs(val)));

			/* negative differences: val - 1 with the bits above 'size' cleared */
			if (val < 0) {
				val--;
				val ^= (-1 << size);
			}

			bputl(&video_out, dc_vlc[size].code | val, dc_vlc[size].length);

			*dc_pred = block[0][0];
		}

		/* AC coefficients */

		{
			/* p points at the header entry of the current run */
			VLC2 *p = ac_vlc_zero;
			int i;

			for (i = 1; i < 64; i++) {
	    			int ulevel, slevel = block[0][iscan[0][(i - 1) & 63]];

				if (slevel) {
					ulevel = abs(slevel);

					/* header length bounds the levels that have a VLC */
		    			if (ulevel < (int) p->length) {
						p += ulevel;
						bputl(&video_out, p->code | ((slevel >> 31) & 1), p->length);
					} else {
		    				int len;

		    				if (slevel > 127) {
							if (slevel > 255)
								return 1;
							/* %000001 escape, 6 bit run, %00000000, slevel & 0xFF */
							slevel = 0x0400000 | (p->code << 16) | (slevel & 0xFF);
							len = 28;
						} else if (slevel < -127) {
							if (slevel < -255)
								return 1;
							/* %000001 escape, 6 bit run, %10000000, slevel (sic) & 0xFF */
							slevel = 0x0408000 | (p->code << 16) | (slevel & 0xFF);
							len = 28;
						} else {
							/* %000001 escape, 6 bit run, slevel & 0xFF */
							slevel = (1 << 14) | (p->code << 8) | (slevel & 0xFF);
							len = 20;
						}

						bputl(&video_out, slevel, len);
					}

					p = ac_vlc_zero; /* run = 0 */
				} else
					p += p->length; /* run++ */
			}
		}

		return 0;
	}

	/* keep a copy of the predictors so they can be restored on overflow */
	dc_dct_pred[1][0] = dc_dct_pred[0][0];
	dc_dct_pred[1][1] = dc_dct_pred[0][1];
	dc_dct_pred[1][2] = dc_dct_pred[0][2];

	v  = encode_block(mblock[1][0], &dc_dct_pred[0][0], dc_vlc_intra[0]);
	v |= encode_block(mblock[1][2], &dc_dct_pred[0][0], dc_vlc_intra[1]);
	v |= encode_block(mblock[1][1], &dc_dct_pred[0][0], dc_vlc_intra[1]);
	v |= encode_block(mblock[1][3], &dc_dct_pred[0][0], dc_vlc_intra[1]);
	v |= encode_block(mblock[1][4], &dc_dct_pred[0][1], dc_vlc_intra[2]);
	v |= encode_block(mblock[1][5], &dc_dct_pred[0][2], dc_vlc_intra[2]);

	bputl(&video_out, 0x2, 2); /* EOB '10' (ISO 13818-2 table B-14) */

	/*
	 *  Saturation is rarely needed, so the forward quantisation code
	 *  skips the step. This routine detects excursions in an
	 *  uncritical path and reports them instead of saturating,
	 *  because saturation often causes a visibly annoying
	 *  reconstruction error.
	 */
	if (v) {
		dc_dct_pred[0][0] = dc_dct_pred[1][0];
		dc_dct_pred[0][1] = dc_dct_pred[1][1];
		dc_dct_pred[0][2] = dc_dct_pred[1][2];
	}

	return v;
}

/*
 *  Disabled C reference for the inter macroblock encoder.
 *
 *  Encodes the blocks of iblock selected by cbp (bit 5 selects block 0,
 *  see the dispatch at the end) and returns 0 on success, non-zero when
 *  a coefficient magnitude exceeds 255.
 *
 *  NOTE(review): refers to ac_vlc_zero and iscan, while the tables
 *  defined in this file carry an mp1e_ prefix; uses a GCC nested
 *  function.
 */
int
mp1e_mpeg1_encode_inter(short iblock[6][8][8], unsigned int cbp)
{
	int v = 0;

	/* Encode one block including its EOB; returns 1 on overflow. */
	int
	encode_block(short block[8][8])
	{
		VLC2 *p = ac_vlc_zero; /* ISO 13818-2 table B-14 */
    		int i = 1, len, ulevel, slevel;

		/* DC coefficient */

		ulevel = abs(slevel = block[0][0]);

		/*
		 *  A first coefficient of +/-1 gets the short 2 bit code;
		 *  anything else restarts at i = 0 so the loop below handles
		 *  it through index (0 - 1) & 63 of iscan.
		 */
		if (ulevel == 1) {
			bputl(&video_out, 0x2 | ((slevel >> 31) & 1), 2);
		} else
			i = 0;

		/* AC coefficients */

		while (i < 64) {
	    		if ((slevel = block[0][iscan[0][(i - 1) & 63]])) {
				ulevel = abs(slevel);

				/* header length bounds the levels that have a VLC */
		    		if (ulevel < (int) p->length) {
					p += ulevel;
					bputl(&video_out, p->code | ((slevel >> 31) & 1), p->length);
				} else {
		    			if (slevel > 127) {
						if (slevel > 255)
							return 1;
						/* %000001 escape, 6 bit run, %00000000, slevel & 0xFF */
						slevel = 0x0400000 | (p->code << 16) | (slevel & 0xFF);
						len = 28;
					} else if (slevel < -127) {
						if (slevel < -255)
							return 1;
						/* %000001 escape, 6 bit run, %10000000, slevel (sic) & 0xFF */
						slevel = 0x0408000 | (p->code << 16) | (slevel & 0xFF);
						len = 28;
					} else {
						/* %000001 escape, 6 bit run, slevel & 0xFF */
						slevel = (1 << 14) | (p->code << 8) | (slevel & 0xFF);
						len = 20;
					}

					bputl(&video_out, slevel, len);
				}

				p = ac_vlc_zero; /* run = 0 */
			} else
			        p += p->length; /* run++ */
			i++;
		}

		/* EOB '10' */
		bputl(&video_out, 0x2, 2);
		return 0;
	}

	/* watch cbp_order */
	if (cbp & (1 << 5)) v  = encode_block(iblock[0]);
	if (cbp & (1 << 3)) v |= encode_block(iblock[2]);
	if (cbp & (1 << 4)) v |= encode_block(iblock[1]);
	if (cbp & (1 << 2)) v |= encode_block(iblock[3]);
	if (cbp & (1 << 1)) v |= encode_block(iblock[4]);
	if (cbp & (1 << 0)) v |= encode_block(iblock[5]);

	return v;
}

#endif

--- NEW FILE ---
/*
 *  MPEG-1 Real Time Encoder
 *
 *  Copyright (C) 1999-2000 Michael H. Schimek
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* $Id: vlc.h,v 1.1 2001/12/04 23:58:10 mswitch Exp $ */

#ifndef VLC_H
#define VLC_H

#include "../common/math.h"
#include "video.h"

/*
 *  Two byte VLC entry: code bits and code length.  The assembly in
 *  vlc_mmx.s indexes these with a scale of 2 and reads the length at
 *  byte offset 1, so the member order and sizes must not change.
 */
typedef struct {
	unsigned char		code;
	unsigned char		length;
} VLC2;

/*
 *  Motion vector VLC entry packed into bit-fields: a code of up to
 *  12 bits and a length of up to 15.
 */
typedef struct {
	unsigned 		code : 12;
	unsigned 		length : 4;
} VLCM;

/*
 *  VLC entry with a 16 bit code, its total length, and mv_length, the
 *  number of trailing bits that are motion vector codes (see
 *  mp1e_macroblock_type_b_nomc_quant).
 */
typedef struct {
	unsigned short		code;
	unsigned char		length, mv_length;
} VLC4;

/*
 *  VLC entry with int sized code and length.  The assembly in
 *  vlc_mmx.s indexes these with a scale of 8 and reads the length at
 *  byte offset 4, so the member order and sizes must not change.
 */
typedef struct {
	unsigned int		code;
	unsigned int		length;
} VLC8;

/* DC predictors, defined in vlc.c; row 0 is cleared by reset_dct_pred(). */
extern int		dc_dct_pred[2][3];

/* VLC tables defined in vlc.c; array sizes must match the definitions. */
extern VLC2		mp1e_coded_block_pattern[64];
extern VLC2		mp1e_macroblock_address_increment[33];
extern VLCM		mp1e_motion_vector_component[480];
extern VLC4		mp1e_macroblock_type_b_nomc_quant[4];
extern VLC2		mp1e_macroblock_type_b_nomc[4];
extern VLC2		mp1e_macroblock_type_b_nomc_notc[4];
extern VLC2		mp1e_macroblock_type_b_quant[4];
extern unsigned char	mp1e_iscan[8][8];
extern VLC8		mp1e_dc_vlc_intra[5][12];
extern VLC2		mp1e_ac_vlc_zero[176];
extern VLC2		mp1e_ac_vlc_one[176];

/* Fills the run-time tables above; call before using them. */
extern void		mp1e_vlc_init(void);

/*
 *  Macroblock encoders.
 *  NOTE(review): vlc.c only carries the mpeg1 pair as disabled
 *  reference code; confirm where the mpeg1/mpeg2 variants are defined.
 */
extern int		mp1e_mpeg1_encode_intra(void);
extern int		mp1e_mpeg1_encode_inter(short mblock[6][8][8],
						unsigned int cbp);
extern int		mp1e_mpeg2_encode_intra(void);
extern int		mp1e_mpeg2_encode_inter(short mblock[6][8][8],
						unsigned int cbp);

/* Assembly implementations, see vlc_mmx.s. */
extern int		mp1e_p6_mpeg1_encode_intra(void);
extern int		mp1e_p6_mpeg1_encode_inter(short mblock[6][8][8],
						   unsigned int cbp);

/* Clear the three DC predictors held in row 0 of dc_dct_pred. */
static inline
void reset_dct_pred(void)
{
	int component;

	for (component = 0; component < 3; component++)
		dc_dct_pred[0][component] = 0;
}

/* Range of f_code values for which motion vector VLCs are built. */
#define F_CODE_MIN 1
#define F_CODE_MAX 4

/*
 *  Per-direction motion vector coding state, set up by motion_init().
 */
struct motion {
	VLCM *			vlc;		/* slice of mp1e_motion_vector_component for f_code */
	int			f_code;		/* F_CODE_MIN..F_CODE_MAX */
	int			f_mask;		/* 0xFF >> (4 - f_code) */
	int			src_range;	/* requested range limited to 4..max_range */
	int			max_range;	/* 4 << f_code */

	/* PMV is zeroed by motion_init(); MV is left untouched there. */
	int			PMV[2], MV[2];
};

/*
 *  Initialise *m for a search range of 'range'.
 *
 *  The range is first limited to mpeg1->motion_min..motion_max, then an
 *  f_code within F_CODE_MIN..F_CODE_MAX is derived from it.  All fields
 *  of *m except MV[] are set; PMV[] is reset to zero.
 */
static inline void
motion_init(mpeg1_context *mpeg1, struct motion *m, int range)
{
	int limited = saturate(range, mpeg1->motion_min, mpeg1->motion_max);
	int fc = saturate(ffsr(limited - 1) - 1, F_CODE_MIN, F_CODE_MAX);
	int reach = 4 << fc;

	m->f_code = fc;
	m->f_mask = 0xFF >> (4 - fc);
	m->max_range = reach;
	m->src_range = saturate(limited, 4, reach);

	/* (15 << fc) & 480 equals ((1 << (fc - 1)) - 1) * 32 for fc 1..4 */
	m->vlc = &mp1e_motion_vector_component[(15 << fc) & 480];

	m->PMV[0] = m->PMV[1] = 0;
}

#endif /* VLC_H */

--- NEW FILE ---
#
#  MPEG-1 Real Time Encoder
# 
#  Copyright (C) 1999-2000 Michael H. Schimek
# 
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License version 2 as
#  published by the Free Software Foundation.
# 
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
# 
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

# $Id: vlc_mmx.s,v 1.1 2001/12/04 23:58:10 mswitch Exp $

# int
# mp1e_p6_mpeg1_encode_intra(void)

	.text
	.align		16
	.globl		mp1e_p6_mpeg1_encode_intra

# Encode the six blocks of the intra macroblock stored at mblock+768
# (mblock[1], 128 bytes per block) in the order 0, 2, 1, 3, 4, 5.
#
# Returns 0 in %eax on success, 1 when a coefficient magnitude exceeds
# 255.  On success dc_dct_pred[0..2] receive the DC values of blocks
# 3, 4 and 5; on failure dc_dct_pred is left unmodified.
#
# Register use while encoding:
#   %ebp  bit count of the 64 bit accumulator (first word of video_out)
#   %mm7  64 bit output accumulator.  It is not initialised here, so it
#         is assumed to be live on entry -- confirm against mmx_bputl.
#   %mm6  holds the real %esp while %esp serves as scan counter
#   %esi  current block, %edi current VLC table position
#
# NOTE(review): %esp does not point at the stack inside the block
# subroutine; presumably nothing may use this stack asynchronously
# (e.g. a signal handler) while it runs -- confirm.
mp1e_p6_mpeg1_encode_intra:

	# Save registers.  Stack layout afterwards, from (%esp):
	# ecx, ebx, esi, edx, edi, ebp.
	pushl		%ebp;
	pushl		%edi;
	pushl		%edx;
	# block 0: DC table row 0, predictor dc_dct_pred[0]
	leal		mp1e_dc_vlc_intra,%edi;
	pushl		%esi;
	leal		mblock+0*128+768,%esi;
	pushl		%ebx;
	movl		video_out,%ebp;
	pushl		%ecx;
	movl		dc_dct_pred,%ebx;
	call		1f;
	# blocks 2, 1, 3: predictor is the DC of the previous luma block,
	# DC table row 1 (12 entries of 8 bytes per row)
	movswl		mblock+0*128+768,%ebx;
	leal		mblock+2*128+768,%esi;
	leal		mp1e_dc_vlc_intra+12*8,%edi;
	call		1f;
	movswl		mblock+2*128+768,%ebx;
	leal		mblock+1*128+768,%esi;
	leal		mp1e_dc_vlc_intra+12*8,%edi;
	call		1f;
	movswl		mblock+1*128+768,%ebx;
	leal		mblock+3*128+768,%esi;
	leal		mp1e_dc_vlc_intra+12*8,%edi;
	call		1f;
	# blocks 4, 5: predictors dc_dct_pred[1] and [2], DC table row 2
	movl		dc_dct_pred+4,%ebx;
	leal		mblock+4*128+768,%esi;
	leal		mp1e_dc_vlc_intra+24*8,%edi;
	call		1f;
	movl		dc_dct_pred+8,%ebx;
	leal		mblock+5*128+768,%esi;
	leal		mp1e_dc_vlc_intra+24*8,%edi;
	call		1f;
	# All blocks succeeded: update the predictors and the bit count.
	movswl		mblock+3*128+768,%eax;
	movswl		mblock+4*128+768,%ebx;
	movl		%eax,dc_dct_pred;
	movswl		mblock+5*128+768,%ecx;
	movl		%ebx,dc_dct_pred+4;
	movl		%ecx,dc_dct_pred+8;
	movl		%ebp,video_out;
	# mmx_bputl(&video_out, 2, 2); the C reference writes the final
	# EOB '10' here.
	movl		$video_out,%eax;
	movl		$2,%ecx;
	movl		$2,%edx;
	call		mmx_bputl;
	# Restore registers and return 0.
	movl		(%esp),%ecx;
	movl		4(%esp),%ebx;
	movl		8(%esp),%esi;
	xorl		%eax,%eax;
	movl		12(%esp),%edx;
	movl		16(%esp),%edi;
	movl		20(%esp),%ebp;
	leal		24(%esp),%esp;
	ret;

	.align	16

	# Block subroutine.  In: %esi block, %ebx DC predictor, %edi DC
	# table row.  %esp becomes the scan counter, running -63 .. 0.
	#
	# DC: val = block[0] - predictor, size = bit length of |val|;
	# negative values become (val - 1) with the bits from 'size' up
	# cleared.  The table code is OR-ed in and its length added.
1:	movd		%esp,%mm6;
	movl		$0,%ecx;
	movswl		(%esi),%eax;
	subl		%ebx,%eax;
	movl		%eax,%ebx;
	cdq;
	xorl		%edx,%eax;
	subl		%edx,%eax;
	bsrl		%eax,%ecx;
	setnz		%al;
	addl		%edx,%ebx;
	movl		$-63,%esp;
	addb		%al,%cl;
	sall		%cl,%edx;
	xorl		%edx,%ebx;
	orl		(%edi,%ecx,8),%ebx;
	addl		4(%edi,%ecx,8),%ebp;
	jmp		4f;

	.align 16

	# AC loop: %ebx is the coefficient index, %edi the header entry of
	# the current run.  A zero coefficient skips to the next run's
	# header (header length entries of 2 bytes) and fetches the next
	# index from mp1e_iscan.  jle tests the flags of incl.
2:	movswl		(%esi,%ebx,2),%eax;
	movzbl		1(%edi),%ecx;
	testl		%eax,%eax;
	jne		3f;
	movzbl		mp1e_iscan+63(%esp),%ebx;
	incl		%esp;
	leal		(%edi,%ecx,2),%edi;
	jle		2b;
	movd		%mm6,%esp;
	ret;

	# Non-zero level: %eax = |level|, %edx = sign mask.  Levels not
	# below the header length have no VLC and take the escape path.
	# Otherwise fetch code and length; the sign goes into bit 0.
3:	cdq;
	xorl		%edx,%eax;
	subl		%edx,%eax;
	cmpl		%ecx,%eax;
	jge		5f;
	movzbl		(%edi,%eax,2),%ebx;
	movzbl		1(%edi,%eax,2),%ecx;
	subl		%edx,%ebx;
	addl		%ecx,%ebp;
	# Emit code %ebx; %ebp already includes its length.  If the
	# accumulator is full (64 - count <= 0) flush at 7, else shift
	# the code into place, reset the run and continue.
4:	movl		$64,%edi;
	movd		%ebx,%mm2;
	subl		%ebp,%edi;
	movd		%edi,%mm1;
	jle		7f;
	leal		mp1e_ac_vlc_zero,%edi;
	psllq		%mm1,%mm2;
	movzbl		mp1e_iscan+63(%esp),%ebx;
	incl		%esp;
	por		%mm2,%mm7;
	jle		2b;
	movd		%mm6,%esp;
	ret;

	# Escape, |level| <= 127: (1 << 14) | run << 8 | (level & 0xFF),
	# 20 bits.  The run is the code byte of the header entry.
5:	movzbl		(%edi),%ecx;
	movswl		(%esi,%ebx,2),%edx;
	cmpl		$127,%eax;
	jg		6f;
	andl		$255,%edx;
	sall		$8,%ecx;
	leal		16384(%ecx,%edx),%ebx;
	addl		$20,%ebp;
	jmp		4b;

	# Escape, 128 <= |level| <= 255:
	# 0x400000 | run << 16 | (level & 0x80FF), 28 bits.
	# The range check is done last: it used to precede addl, whose
	# flags then decided the jle, so this code was never emitted and
	# every |level| > 127 was reported as overflow.
6:	sall		$16,%ecx;
	andl		$33023,%edx;
	leal		4194304(%ecx,%edx),%ebx;
	addl		$28,%ebp;
	cmpl		$255,%eax;
	jle		4b;

	# Overflow: restore %esp, drop the subroutine's return address,
	# restore the saved registers and return 1 to the caller.
	movd		%mm6,%esp;
	addl		$4,%esp;
	movl		$1,%eax;
	popl		%ecx;
	popl		%ebx;
	popl		%esi;
	popl		%edx;
	popl		%edi;
	popl		%ebp;
	ret;

	.align 16

	# Flush: the new code crosses the 64 bit boundary.  %mm4 and the
	# new %ebp are the bit count beyond 64.  The leading part of the
	# code completes the accumulator, which is stored as two
	# byte-swapped dwords at the pointer in video_out+4; the pointer
	# advances by 8.  The remaining bits start the new accumulator.
	# NOTE(review): the quadword at video_out+16 is presumably the
	# constant 64 -- confirm in struct bs_rec.
7:	movq		video_out+16,%mm3;
	movq		%mm2,%mm5;
	leal		mp1e_ac_vlc_zero,%edi;
	pxor		%mm4,%mm4;
	psubd		%mm1,%mm4;
	movd		%mm4,%ebp;
	psubd		%mm4,%mm3;
	psrld		%mm4,%mm5;
	movl		video_out+4,%ecx;
	por		%mm5,%mm7;
	movd		%mm7,%eax;
	movzbl		mp1e_iscan+63(%esp),%ebx;
	psrlq		$32,%mm7;
	bswap		%eax;
	leal		8(%ecx),%edx;
	movl		%eax,4(%ecx);
	movd		%mm7,%eax;
	bswap		%eax;
	psllq		%mm3,%mm2;
	incl		%esp;
	movq		%mm2,%mm7;
	movl		%eax,(%ecx);
	movl		%edx,video_out+4;
	jle		2b;
	movd		%mm6,%esp;
	ret;

# int
# mp1e_p6_mpeg1_encode_inter(short mblock[6][8][8], unsigned int cbp)

	.text
	.align		16
	.globl		mp1e_p6_mpeg1_encode_inter

# Encode the blocks of an inter macroblock selected by cbp.
#
# Stack arguments: 4(%esp) block array (128 bytes per block),
# 8(%esp) cbp.  cbp bits 32, 8, 16, 4, 2, 1 select blocks
# 0, 2, 1, 3, 4, 5, tested in that order.
#
# Returns 0 in %eax on success, 1 when a coefficient magnitude exceeds
# 255.
#
# Register use while encoding a block:
#   %ebx  bit count of the 64 bit accumulator (first word of video_out)
#   %mm7  64 bit output accumulator.  It is not initialised here, so it
#         is assumed to be live on entry -- confirm against mmx_bputl.
#   %mm6  holds the real %esp while %esp serves as scan counter
#   %esi  current block, %edi current VLC table position,
#   %ebp  coefficient index / code to emit
#
# NOTE(review): %esp does not point at the stack inside the block
# subroutine; presumably nothing may use this stack asynchronously
# (e.g. a signal handler) while it runs -- confirm.
mp1e_p6_mpeg1_encode_inter:

	# The flags of this test survive the pushes and the load below
	# and are consumed by the first je.
	testl		$32,1*4+4(%esp);
	pushl		%esi
	movl		2*4+0(%esp),%esi;
	pushl		%ebp
	pushl		%edi
	pushl		%ebx
	# After four pushes: block array at 5*4+0(%esp), cbp at 5*4+4(%esp).
	je		2f;
	call		1f;
	movl		5*4+0(%esp),%esi;
2:	testl		$8,5*4+4(%esp);
	je		2f;
	leal		2*128(%esi),%esi;
	call		1f;
	movl		5*4+0(%esp),%esi;
2:	testl		$16,5*4+4(%esp);
	je		2f;
	leal		1*128(%esi),%esi;
	call		1f;
	movl		5*4+0(%esp),%esi;
2:	testl		$4,5*4+4(%esp);
	je		2f;
	leal		3*128(%esi),%esi;
	call		1f;
	movl		5*4+0(%esp),%esi;
2:	testl		$2,5*4+4(%esp);
	je		2f;
	leal		4*128(%esi),%esi;
	call		1f;
	movl		5*4+0(%esp),%esi;
2:	testl		$1,5*4+4(%esp);
	je		2f;
	leal		5*128(%esi),%esi;
	call		1f;
2:
	# Success: return 0.
	xorl		%eax,%eax
	popl		%ebx
	popl		%edi
	popl		%ebp
	popl		%esi
	ret

	.align	16

	# Block subroutine.  In: %esi block.  %esp becomes the scan
	# counter, running -63 .. 0.
	#
	# A first coefficient of +/-1 is written as the 2 bit code
	# 2 + sign.  Anything else enters the general loop with index 0,
	# so the first coefficient is handled like any other.
1:	movswl		(%esi),%eax;
	movl		$0,%ebp;
	movd		%esp,%mm6;
	movl		video_out,%ebx;
	movl		$-63,%esp;
	leal		mp1e_ac_vlc_zero,%edi;
	cdq;
	xorl		%edx,%eax;
	subl		%edx,%eax;
	decl		%eax;
	jne		3f
	movl		$2,%ebp;
	subl		%edx,%ebp;
	addl		$2,%ebx;
	jmp		9f;

	.align 16

	# Coefficient loop: %ebp is the coefficient index, %edi the header
	# entry of the current run.  A zero coefficient skips to the next
	# run's header and fetches the next index from mp1e_iscan.  jle
	# tests the flags of incl.
3:	movswl		(%esi,%ebp,2),%eax;
	testl		%eax,%eax;
	movzbl		1(%edi),%ecx;
	jne		4f;
	movzbl		mp1e_iscan+63(%esp),%ebp;
	incl		%esp;
	leal		(%edi,%ecx,2),%edi;
	jle		3b;

	# Block done: store the bit count, restore %esp and tail-call
	# mmx_bputl(&video_out, 2, 2); the C reference writes the EOB
	# '10' here.  mmx_bputl returns to the subroutine's caller.
0:	movl		%ebx,video_out;
	movd		%mm6,%esp;
	movl		$video_out,%eax;
	movl		$2,%ecx;
	movl		$2,%edx;
	jmp		mmx_bputl;

	# Non-zero level: %eax = |level|, %edx = sign mask.  Levels not
	# below the header length have no VLC and take the escape path.
	# Otherwise fetch the code, add its length to the bit count (low
	# byte only) and put the sign into bit 0.
4:	cdq;
	xorl		%edx,%eax;
	subl		%edx,%eax;
	cmpl		%ecx,%eax;
	jge		5f;
	movzbl		(%edi,%eax,2),%ebp;
	addb		1(%edi,%eax,2),%bl;
	subl		%edx,%ebp;
	# Emit code %ebp; %ebx already includes its length.  If the
	# accumulator is full (64 - count <= 0) flush at 8, else shift
	# the code into place, reset the run and continue.
9:	movl		$64,%edi;
	movd		%ebp,%mm2;
	subl		%ebx,%edi;
	movd		%edi,%mm1;
	jle		8f;
	leal		mp1e_ac_vlc_zero,%edi;
	movzbl		mp1e_iscan+63(%esp),%ebp;
	psllq		%mm1,%mm2;
	incl		%esp;
	por		%mm2,%mm7;
	jle 		3b;
	jmp		0b;

	.align 16

	# Escape, |level| <= 127: (1 << 14) | run << 8 | (level & 0xFF),
	# 20 bits.  The run is the code byte of the header entry.
5:	movswl		(%esi,%ebp,2),%ebp;
	movzbl		(%edi),%ecx;
	cmpl		$127,%eax;
	jg		6f;
	sall		$8,%ecx;
	andl		$255,%ebp;
	leal		16384(%ecx,%ebp),%ebp;
	addb		$20,%bl;
	jmp		9b;

	# Escape, 128 <= |level| <= 255:
	# 0x400000 | run << 16 | (level & 0x80FF), 28 bits.
	# The range check is done last: it used to come first, and the
	# flags of sall/andl/addb then decided the jle, so this code was
	# never emitted and every |level| > 127 was reported as overflow.
6:	sall		$16,%ecx;
	andl		$33023,%ebp;
	leal		4194304(%ecx,%ebp),%ebp;
	addb		$28,%bl;
	cmpl		$255,%eax;
	jle		9b;

	# Overflow: restore %esp, drop the subroutine's return address,
	# restore the saved registers and return 1 to the caller.
	movd		%mm6,%esp;
	addl		$4,%esp;
	popl		%ebx;
	popl		%edi;
	movl		$1,%eax;
	popl		%ebp;
	popl		%esi;
	ret;

	.align 16

	# Flush: the new code crosses the 64 bit boundary.  %mm4 and the
	# new %ebx are the bit count beyond 64.  The leading part of the
	# code completes the accumulator, which is stored as two
	# byte-swapped dwords at the pointer in video_out+4; the pointer
	# advances by 8.  The remaining bits start the new accumulator.
	# NOTE(review): the quadword at video_out+16 is presumably the
	# constant 64 -- confirm in struct bs_rec.
8:	leal		mp1e_ac_vlc_zero,%edi;
	movq		video_out+16,%mm3;
	movq		%mm2,%mm5;
	pxor		%mm4,%mm4;
	psubd		%mm1,%mm4;
	movd		%mm4,%ebx;
	psrld		%mm4,%mm5;
	movl		video_out+4,%ecx;
	por		%mm5,%mm7;
	psubd		%mm4,%mm3;
	movzbl		mp1e_iscan+63(%esp),%ebp;
	movd		%mm7,%eax;
	psrlq		$32,%mm7;
	psllq		%mm3,%mm2;
	leal		8(%ecx),%edx;
	bswap		%eax;
	movl		%eax,4(%ecx);
	movd		%mm7,%eax;
	incl		%esp;
	movl		%edx,video_out+4;
	movq		%mm2,%mm7;
	bswap		%eax;
	movl		%eax,(%ecx);
	jle		3b;
	jmp		0b;