[Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations

Oded Shimon ods15
Fri Mar 10 18:41:36 CET 2006


On Thu, Mar 09, 2006 at 07:14:24PM -0500, Robert Edele wrote:
> On Thu, 2006-03-09 at 15:23 -0800, Loren Merritt wrote:
> > On Thu, 9 Mar 2006, Robert Edele wrote:
> > 
> > > I've removed interleave_line_footer which is never invoked, as it is not
> > > used by the asm cases and Michael's code is used for the C case.
> > >
> > > Michael, I believe that the patch is ready to be reviewed and hopefully
> > > committed. There might be a small bug or two left, but the code looks
> > > good to my eyes and has been tested on AMD-64, P4, and AMD-32 systems
> > > with no issues (no crashes, and md5sums all agree). Regression tests on
> > > my machine all pass (C, mmx, sse2).
> > >
> > > If there are any more remaining issues with the code, please let me
> > > know. Thanks for your patience.
> > 
> > >+ if (!(b_h & 1))
> > >+     inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, 
> > >+         block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
> > >+ else
> > >+     inner_add_yblock_bw_8_obmc_16_bh_even_mmx(obmc, obmc_stride, 
> > >+         block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
> > 
> > inner_add_yblock_bw_8_obmc_16_bh_even_mmx is called when bh is odd?
> > 
> > --Loren Merritt
> 
> That must have been an oversight by Oded (ods15) while helping me out in
> porting the code out of snow.c and renaming that function after reading
> the corresponding sse2 function (which does require even b_h). The mmx
> version handles either odd or even just fine and it was never written
> with the word 'even' in it. On the plus side, he is the one that thought
> of using the mmx as a fallback should b_h be odd instead of falling back
> on the C version.

Heh, I didn't think of anything, I just copied the behavior from when you 
sent me the non-dsputil patch...
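
For what it's worth, the fix should just be dropping the misleading '_even' 
suffix from the mmx fallback, so the dispatch reads something like this 
(sketch only; the exact renamed mmx function name is up to Robert):

    if (!(b_h & 1))  /* the sse2 version requires an even b_h */
        inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride,
            block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
    else             /* the mmx version handles odd b_h just fine */
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride,
            block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);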

On a completely different note, I have optimized mc_block. The C version 
is 30-50% faster, and the MMX version is around 70% faster. qpel is no 
longer expensive in snow... I merged the two patches here... md5sum checks 
out. I was worried about odd resolutions, but it seems Snow has stopped 
supporting any resolution not divisible by 8...
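
In case the idea isn't obvious from the diff: mc_block is split into a 
horizontal and a vertical pass, each specialized per (even) subpel offset 
through small function pointer tables, so pred_block now does roughly this 
(a sketch of what the patch below implements):

    if((dx&3) || (dy&3)){
        /* dx, dy are even and < 16, so dx>>1 / dy>>1 index 8-entry tables */
        s->dsp.mc_block_x[dx>>1](dst, src, tmp, stride, b_w, b_h, dx, dy); /* horizontal pass */
        s->dsp.mc_block_y[dy>>1](dst, src, tmp, stride, b_w, b_h, dx, dy); /* vertical pass   */
    }

Each pass is a no-op when its offset is zero (the other pass then reads from 
or writes to the final buffer directly), and specializing per offset lets the 
constant multiplies collapse into shifts and adds, which is a big part of the 
speedup.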

- ods15
-------------- next part --------------
Index: libavcodec/snow.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/snow.c,v
retrieving revision 1.87
diff -u -r1.87 snow.c
--- libavcodec/snow.c	30 Jan 2006 23:33:18 -0000	1.87
+++ libavcodec/snow.c	10 Mar 2006 16:36:15 -0000
@@ -19,23 +19,15 @@
 #include "avcodec.h"
 #include "common.h"
 #include "dsputil.h"
+#include "snow.h"
 
 #include "rangecoder.h"
-#define MID_STATE 128
 
 #include "mpegvideo.h"
 
 #undef NDEBUG
 #include <assert.h>
 
-#define MAX_DECOMPOSITIONS 8
-#define MAX_PLANES 4
-#define DWTELEM int
-#define QSHIFT 5
-#define QROOT (1<<QSHIFT)
-#define LOSSLESS_QLOG -128
-#define FRAC_BITS 8
-
 static const int8_t quant3[256]={
  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -181,7 +173,7 @@
 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 };
 
-#define LOG2_OBMC_MAX 6
+#define LOG2_OBMC_MAX 8
 #define OBMC_MAX (1<<(LOG2_OBMC_MAX))
 #if 0 //64*cubic
 static const uint8_t obmc32[1024]={
@@ -240,6 +232,39 @@
 };
 #elif 1 // 64*linear
 static const uint8_t obmc32[1024]={
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+/*
  0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
  0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
  0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
@@ -272,9 +297,27 @@
  0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
  0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
  0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+*/
  //error:0.000020
 };
 static const uint8_t obmc16[256]={
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+/*
  0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
  1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
  1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
@@ -291,6 +334,7 @@
  1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
  1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
+*/
 //error:0.000015
 };
 #else //64*cos
@@ -352,6 +396,15 @@
 
 //linear *64
 static const uint8_t obmc8[64]={
+  4, 12, 20, 28, 28, 20, 12,  4,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+ 20, 60,100,140,140,100, 60, 20,
+ 28, 84,140,196,196,140, 84, 28,
+ 28, 84,140,196,196,140, 84, 28,
+ 20, 60,100,140,140,100, 60, 20,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+  4, 12, 20, 28, 28, 20, 12,  4,
+/*
  1, 3, 5, 7, 7, 5, 3, 1,
  3, 9,15,21,21,15, 9, 3,
  5,15,25,35,35,25,15, 5,
@@ -360,15 +413,22 @@
  5,15,25,35,35,25,15, 5,
  3, 9,15,21,21,15, 9, 3,
  1, 3, 5, 7, 7, 5, 3, 1,
+*/
 //error:0.000000
 };
 
 //linear *64
 static const uint8_t obmc4[16]={
+ 16, 48, 48, 16,
+ 48,144,144, 48,
+ 48,144,144, 48,
+ 16, 48, 48, 16,
+ /*
  4,12,12, 4,
 12,36,36,12,
 12,36,36,12,
  4,12,12, 4,
+ */
 //error:0.000000
 };
 
@@ -425,17 +485,6 @@
     SubBand band[MAX_DECOMPOSITIONS][4];
 }Plane;
 
-/** Used to minimize the amount of memory used in order to optimize cache performance. **/
-typedef struct {
-    DWTELEM * * line; ///< For use by idwt and predict_slices.
-    DWTELEM * * data_stack; ///< Used for internal purposes.
-    int data_stack_top;
-    int line_count;
-    int line_width;
-    int data_count;
-    DWTELEM * base_buffer; ///< Buffer that this structure is caching.
-} slice_buffer;
-
 typedef struct SnowContext{
 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
 
@@ -741,6 +790,7 @@
     }
 }
 
+#ifndef lift5
 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
     const int mirror_left= !highpass;
     const int mirror_right= (width&1) ^ highpass;
@@ -770,7 +820,9 @@
         dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
     }
 }
+#endif
 
+#ifndef liftS
 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
     const int mirror_left= !highpass;
     const int mirror_right= (width&1) ^ highpass;
@@ -793,6 +845,7 @@
         dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
     }
 }
+#endif
 
 
 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
@@ -1111,76 +1164,6 @@
     }
 }
 
-#define liftS lift
-#define lift5 lift
-#if 1
-#define W_AM 3
-#define W_AO 0
-#define W_AS 1
-
-#undef liftS
-#define W_BM 1
-#define W_BO 8
-#define W_BS 4
-
-#define W_CM 1
-#define W_CO 0
-#define W_CS 0
-
-#define W_DM 3
-#define W_DO 4
-#define W_DS 3
-#elif 0
-#define W_AM 55
-#define W_AO 16
-#define W_AS 5
-
-#define W_BM 3
-#define W_BO 32
-#define W_BS 6
-
-#define W_CM 127
-#define W_CO 64
-#define W_CS 7
-
-#define W_DM 7
-#define W_DO 8
-#define W_DS 4
-#elif 0
-#define W_AM 97
-#define W_AO 32
-#define W_AS 6
-
-#define W_BM 63
-#define W_BO 512
-#define W_BS 10
-
-#define W_CM 13
-#define W_CO 8
-#define W_CS 4
-
-#define W_DM 15
-#define W_DO 16
-#define W_DS 5
-
-#else
-
-#define W_AM 203
-#define W_AO 64
-#define W_AS 7
-
-#define W_BM 217
-#define W_BO 2048
-#define W_BS 12
-
-#define W_CM 113
-#define W_CO 64
-#define W_CS 7
-
-#define W_DM 227
-#define W_DO 128
-#define W_DS 9
-#endif
 static void horizontal_decompose97i(DWTELEM *b, int width){
     DWTELEM temp[width];
     const int w2= (width+1)>>1;
@@ -1410,7 +1393,7 @@
 }
 
 
-static void horizontal_compose97i(DWTELEM *b, int width){
+void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
     DWTELEM temp[width];
     const int w2= (width+1)>>1;
 
@@ -1463,7 +1446,7 @@
     }
 }
 
-static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
+void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
     int i;
 
     for(i=0; i<width; i++){
@@ -1504,7 +1487,7 @@
     cs->y = -3;
 }
 
-static void spatial_compose97i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
+static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
     int y = cs->y;
 
     DWTELEM *b0= cs->b0;
@@ -1516,7 +1499,7 @@
 
 {START_TIMER
     if(y>0 && y+4<height){
-        vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
+        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
     }else{
         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
@@ -1527,8 +1510,8 @@
 STOP_TIMER("vertical_compose97i")}}
 
 {START_TIMER
-        if(y-1<(unsigned)height) horizontal_compose97i(b0, width);
-        if(y+0<(unsigned)height) horizontal_compose97i(b1, width);
+        if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
+        if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
 if(width>400 && y+0<(unsigned)height){
 STOP_TIMER("horizontal_compose97i")}}
 
@@ -1557,8 +1540,8 @@
 STOP_TIMER("vertical_compose97i")}}
 
 {START_TIMER
-        if(y-1<(unsigned)height) horizontal_compose97i(b0, width);
-        if(y+0<(unsigned)height) horizontal_compose97i(b1, width);
+        if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
+        if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
 if(width>400 && b0 <= b2){
 STOP_TIMER("horizontal_compose97i")}}
 
@@ -1619,7 +1602,7 @@
     }
 }
 
-static void ff_spatial_idwt_buffered_slice(dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
+static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
     const int support = type==1 ? 3 : 5;
     int level;
     if(type==2) return;
@@ -1627,7 +1610,7 @@
     for(level=decomposition_count-1; level>=0; level--){
         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
             switch(type){
-            case 0: spatial_compose97i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
+            case 0: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
                     break;
             case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
                     break;
@@ -2381,91 +2364,6 @@
     }
 }
 
-static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
-    int x, y;
-START_TIMER
-    for(y=0; y < b_h+5; y++){
-        for(x=0; x < b_w; x++){
-            int a0= src[x    ];
-            int a1= src[x + 1];
-            int a2= src[x + 2];
-            int a3= src[x + 3];
-            int a4= src[x + 4];
-            int a5= src[x + 5];
-//            int am= 9*(a1+a2) - (a0+a3);
-            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
-//            int am= 18*(a2+a3) - 2*(a1+a4);
-//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
-//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
-
-//            if(b_w==16) am= 8*(a1+a2);
-
-            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
-            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
-
-            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
-            if(am&(~255)) am= ~(am>>31);
-
-            tmp[x] = am;
-
-/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
-            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
-            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
-            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
-        }
-        tmp += stride;
-        src += stride;
-    }
-    tmp -= (b_h+5)*stride;
-
-    for(y=0; y < b_h; y++){
-        for(x=0; x < b_w; x++){
-            int a0= tmp[x + 0*stride];
-            int a1= tmp[x + 1*stride];
-            int a2= tmp[x + 2*stride];
-            int a3= tmp[x + 3*stride];
-            int a4= tmp[x + 4*stride];
-            int a5= tmp[x + 5*stride];
-            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
-//            int am= 18*(a2+a3) - 2*(a1+a4);
-/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
-            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
-
-//            if(b_w==16) am= 8*(a1+a2);
-
-            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
-            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
-
-            if(am&(~255)) am= ~(am>>31);
-
-            dst[x] = am;
-/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
-            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
-            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
-            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
-        }
-        dst += stride;
-        tmp += stride;
-    }
-STOP_TIMER("mc_block")
-}
-
-#define mca(dx,dy,b_w)\
-static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
-    uint8_t tmp[stride*(b_w+5)];\
-    assert(h==b_w);\
-    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
-}
-
-mca( 0, 0,16)
-mca( 8, 0,16)
-mca( 0, 8,16)
-mca( 8, 8,16)
-mca( 0, 0,8)
-mca( 8, 0,8)
-mca( 0, 8,8)
-mca( 8, 8,8)
-
 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
     if(block->type & BLOCK_INTRA){
         int x, y;
@@ -2524,9 +2422,14 @@
         assert(!(b_w&(b_w-1)));
         assert(b_w>1 && b_h>1);
         assert(tab_index>=0 && tab_index<4 || b_w==32);
-        if((dx&3) || (dy&3))
-            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
-        else if(b_w==32){
+        if((dx&3) || (dy&3)) {
+            START_TIMER
+            assert(!(dx&1) && !(dy&1));
+            assert(dx<16 && dy<16);
+            s->dsp.mc_block_x[dx>>1](dst, src, tmp, stride, b_w, b_h, dx, dy);
+            s->dsp.mc_block_y[dy>>1](dst, src, tmp, stride, b_w, b_h, dx, dy);
+            STOP_TIMER("mc_block")
+        } else if(b_w==32){
             int y;
             for(y=0; y<b_h; y+=16){
                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
@@ -2545,6 +2448,40 @@
     }
 }
 
+void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    int y, x;
+    DWTELEM * dst;
+    for(y=0; y<b_h; y++){
+        //FIXME ugly misuse of obmc_stride
+        uint8_t *obmc1= obmc + y*obmc_stride;
+        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+        dst = slice_buffer_get_line(sb, src_y + y);
+        for(x=0; x<b_w; x++){
+            int v=   obmc1[x] * block[3][x + y*src_stride]
+                    +obmc2[x] * block[2][x + y*src_stride]
+                    +obmc3[x] * block[1][x + y*src_stride]
+                    +obmc4[x] * block[0][x + y*src_stride];
+
+            v <<= 8 - LOG2_OBMC_MAX;
+            if(FRAC_BITS != 8){
+                v += 1<<(7 - FRAC_BITS);
+                v >>= 8 - FRAC_BITS;
+            }
+            if(add){
+                v += dst[x + src_x];
+                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                if(v&(~255)) v= ~(v>>31);
+                dst8[x + y*src_stride] = v;
+            }else{
+                dst[x + src_x] -= v;
+            }
+        }
+    }
+}
+
 //FIXME name clenup (b_w, block_w, b_width stuff)
 static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
     DWTELEM * dst = NULL;
@@ -2669,36 +2606,7 @@
 
     START_TIMER
 
-    for(y=0; y<b_h; y++){
-        //FIXME ugly missue of obmc_stride
-        uint8_t *obmc1= obmc + y*obmc_stride;
-        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
-        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
-        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
-        dst = slice_buffer_get_line(sb, src_y + y);
-        for(x=0; x<b_w; x++){
-            int v=   obmc1[x] * block[3][x + y*src_stride]
-                    +obmc2[x] * block[2][x + y*src_stride]
-                    +obmc3[x] * block[1][x + y*src_stride]
-                    +obmc4[x] * block[0][x + y*src_stride];
-
-            v <<= 8 - LOG2_OBMC_MAX;
-            if(FRAC_BITS != 8){
-                v += 1<<(7 - FRAC_BITS);
-                v >>= 8 - FRAC_BITS;
-            }
-            if(add){
-//                v += old_dst[x + y*dst_stride];
-                v += dst[x + src_x];
-                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
-                if(v&(~255)) v= ~(v>>31);
-                dst8[x + y*src_stride] = v;
-            }else{
-//                old_dst[x + y*dst_stride] -= v;
-                dst[x + src_x] -= v;
-            }
-        }
-    }
+    s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
         STOP_TIMER("Inner add y block")
 }
 #endif
@@ -3044,7 +2952,7 @@
     }
     *b= backup;
 
-    return clip(((ab<<6) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
+    return clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
 }
 
 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
@@ -3104,10 +3012,10 @@
     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
     int sx= block_w*mb_x - block_w/2;
     int sy= block_w*mb_y - block_w/2;
-    const int x0= FFMAX(0,-sx);
-    const int y0= FFMAX(0,-sy);
-    const int x1= FFMIN(block_w*2, w-sx);
-    const int y1= FFMIN(block_w*2, h-sy);
+    int x0= FFMAX(0,-sx);
+    int y0= FFMAX(0,-sy);
+    int x1= FFMIN(block_w*2, w-sx);
+    int y1= FFMIN(block_w*2, h-sy);
     int i,x,y;
 
     pred_block(s, cur, ref, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
@@ -3125,6 +3033,22 @@
         }
     }
 
+    /* copy the regions where obmc[] = (uint8_t)256 */
+    if(LOG2_OBMC_MAX == 8
+        && (mb_x == 0 || mb_x == b_stride-1)
+        && (mb_y == 0 || mb_y == b_height-1)){
+        if(mb_x == 0)
+            x1 = block_w;
+        else
+            x0 = block_w;
+        if(mb_y == 0)
+            y1 = block_w;
+        else
+            y0 = block_w;
+        for(y=y0; y<y1; y++)
+            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
+    }
+
     //FIXME sad/ssd can be broken up, but wavelet cmp should be one 32x32 block
     if(block_w==16){
         distortion = 0;
@@ -3820,19 +3744,6 @@
     mcf( 8,12)
     mcf(12,12)
 
-#define mcfh(dx,dy)\
-    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
-    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
-        mc_block_hpel ## dx ## dy ## 16;\
-    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
-    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
-        mc_block_hpel ## dx ## dy ## 8;
-
-    mcfh(0, 0)
-    mcfh(8, 0)
-    mcfh(0, 8)
-    mcfh(8, 8)
-
     if(!qexp[0])
         init_qexp();
 
@@ -4387,7 +4298,7 @@
 
 {   START_TIMER
         for(; yd<slice_h; yd+=4){
-            ff_spatial_idwt_buffered_slice(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
+            ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
         }
     STOP_TIMER("idwt slice");}
 
Index: libavcodec/dsputil.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.c,v
retrieving revision 1.134
diff -u -r1.134 dsputil.c
--- libavcodec/dsputil.c	10 Feb 2006 06:55:24 -0000	1.134
+++ libavcodec/dsputil.c	10 Mar 2006 16:36:16 -0000
@@ -3772,6 +3772,77 @@
     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
 }
 
+static always_inline void mc_block_x(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+    int x, y;
+    if (dy == 0) {
+        b_h -= 5;
+        tmp = dst;
+        src += 2*stride;
+    }
+    if (dx != 0) for(y=0; y < b_h+5; y++){
+        for(x=0; x < b_w; x++){
+            int a0= src[x    ];
+            int a1= src[x + 1];
+            int a2= src[x + 2];
+            int a3= src[x + 3];
+            int a4= src[x + 4];
+            int a5= src[x + 5];
+            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+
+            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
+            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
+
+            if(am&(~255)) am= ~(am>>31);
+
+            tmp[x] = am;
+        }
+        tmp += stride;
+        src += stride;
+    }
+}
+
+static always_inline void mc_block_y(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+    int x, y;
+    if (dx == 0) tmp = (uint8_t*)src + 2;
+    if (dy != 0) for(y=0; y < b_h; y++){
+        for(x=0; x < b_w; x++){
+            int a0= tmp[x + 0*stride];
+            int a1= tmp[x + 1*stride];
+            int a2= tmp[x + 2*stride];
+            int a3= tmp[x + 3*stride];
+            int a4= tmp[x + 4*stride];
+            int a5= tmp[x + 5*stride];
+            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+
+            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
+            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
+
+            if(am&(~255)) am= ~(am>>31);
+
+            dst[x] = am;
+        }
+        dst += stride;
+        tmp += stride;
+    }
+}
+
+#define mca(a)\
+static void mc_block_x ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+    mc_block_x(dst, src, tmp, stride, b_w, b_h, a, dy);\
+}\
+static void mc_block_y ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+    mc_block_y(dst, src, tmp, stride, b_w, b_h, dx, a);\
+}
+
+mca(0)
+mca(2)
+mca(4)
+mca(6)
+mca(8)
+mca(10)
+mca(12)
+mca(14)
+
 /* init static data */
 void dsputil_static_init(void)
 {
@@ -4047,6 +4118,28 @@
     c->try_8x8basis= try_8x8basis_c;
     c->add_8x8basis= add_8x8basis_c;
 
+    c->vertical_compose97i = ff_snow_vertical_compose97i;
+    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
+    c->inner_add_yblock = ff_snow_inner_add_yblock;
+
+    c->mc_block_x[0] = mc_block_x0;
+    c->mc_block_x[1] = mc_block_x2;
+    c->mc_block_x[2] = mc_block_x4;
+    c->mc_block_x[3] = mc_block_x6;
+    c->mc_block_x[4] = mc_block_x8;
+    c->mc_block_x[5] = mc_block_x10;
+    c->mc_block_x[6] = mc_block_x12;
+    c->mc_block_x[7] = mc_block_x14;
+
+    c->mc_block_y[0] = mc_block_y0;
+    c->mc_block_y[1] = mc_block_y2;
+    c->mc_block_y[2] = mc_block_y4;
+    c->mc_block_y[3] = mc_block_y6;
+    c->mc_block_y[4] = mc_block_y8;
+    c->mc_block_y[5] = mc_block_y10;
+    c->mc_block_y[6] = mc_block_y12;
+    c->mc_block_y[7] = mc_block_y14;
+
 #ifdef HAVE_MMX
     dsputil_init_mmx(c, avctx);
 #endif
Index: libavcodec/dsputil.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.h,v
retrieving revision 1.129
diff -u -r1.129 dsputil.h
--- libavcodec/dsputil.h	8 Mar 2006 04:13:55 -0000	1.129
+++ libavcodec/dsputil.h	10 Mar 2006 16:36:17 -0000
@@ -30,6 +30,7 @@
 
 #include "common.h"
 #include "avcodec.h"
+#include "snow.h"
 
 
 //#define DEBUG
@@ -132,6 +133,8 @@
 // allthough currently h<4 is not used as functions with width <8 are not used and neither implemented
 typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
 
+typedef void (*mc_block_func)(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy);
+
 
 /**
  * DSPContext.
@@ -334,6 +337,14 @@
     void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
     void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
     void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+
+    /* snow wavelet */
+    void (*vertical_compose97i)(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+    void (*horizontal_compose97i)(DWTELEM *b, int width);
+    void (*inner_add_yblock)(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+    mc_block_func mc_block_x[8];
+    mc_block_func mc_block_y[8];
 } DSPContext;
 
 void dsputil_static_init(void);
Index: libavcodec/Makefile
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/Makefile,v
retrieving revision 1.244
diff -u -r1.244 Makefile
--- libavcodec/Makefile	8 Mar 2006 04:13:55 -0000	1.244
+++ libavcodec/Makefile	10 Mar 2006 16:36:17 -0000
@@ -330,7 +330,7 @@
 	i386/dsputil_mmx.o i386/mpegvideo_mmx.o \
 	i386/idct_mmx.o i386/motion_est_mmx.o \
 	i386/simple_idct_mmx.o i386/fft_sse.o i386/vp3dsp_mmx.o \
-	i386/vp3dsp_sse2.o i386/fft_3dn.o i386/fft_3dn2.o
+	i386/vp3dsp_sse2.o i386/fft_3dn.o i386/fft_3dn2.o i386/snowdsp_mmx.o
 ifeq ($(CONFIG_GPL),yes)
 OBJS += i386/idct_mmx_xvid.o
 endif
Index: libavcodec/i386/mmx.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/mmx.h,v
retrieving revision 1.7
diff -u -r1.7 mmx.h
--- libavcodec/i386/mmx.h	22 Dec 2005 01:10:09 -0000	1.7
+++ libavcodec/i386/mmx.h	10 Mar 2006 16:36:17 -0000
@@ -12,6 +12,7 @@
 #  define REG_d "rdx"
 #  define REG_D "rdi"
 #  define REG_S "rsi"
+#  define PTR_SIZE "8"
 #else
 #  define REG_a "eax"
 #  define REG_b "ebx"
@@ -19,6 +20,7 @@
 #  define REG_d "edx"
 #  define REG_D "edi"
 #  define REG_S "esi"
+#  define PTR_SIZE "4"
 #endif
 
 /*
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
retrieving revision 1.113
diff -u -r1.113 dsputil_mmx.c
--- libavcodec/i386/dsputil_mmx.c	7 Mar 2006 22:45:56 -0000	1.113
+++ libavcodec/i386/dsputil_mmx.c	10 Mar 2006 16:36:19 -0000
@@ -2564,6 +2564,150 @@
 }
 #endif
 
+static always_inline void mc_block_core(int dx, uint8_t * dst) {
+    asm volatile("punpcklbw  %%mm6, %%mm0     \n\t"
+                 "punpcklbw  %%mm6, %%mm1     \n\t"
+                 "punpcklbw  %%mm6, %%mm2     \n\t"
+                 "punpcklbw  %%mm6, %%mm3     \n\t"
+                 "punpcklbw  %%mm6, %%mm4     \n\t"
+                 "punpcklbw  %%mm6, %%mm5     \n\t"
+
+                 "paddsw     %%mm5, %%mm0     \n\t" // am += a5;
+
+                 "paddsw     %%mm4, %%mm1     \n\t" // a1 += a4;
+                 "movq       %%mm1, %%mm4     \n\t" // a4  = a1;
+                 "psllw         $2, %%mm1     \n\t" // a1 *=  4;
+                 "paddsw     %%mm4, %%mm1     \n\t" // a1 += a4;
+                 "psubsw     %%mm1, %%mm0     \n\t" // am -= a1;
+
+                 "movq       %%mm2, %%mm4     \n\t" // a4  = a2;
+                 "paddsw     %%mm3, %%mm4     \n\t" // a4 += a3;
+
+                 "psllw         $2, %%mm4     \n\t" // a4 *=  4;
+                 "paddsw     %%mm4, %%mm0     \n\t" // am += a4;
+                 "psllw         $2, %%mm4     \n\t" // a4 *=  4;
+                 "paddsw     %%mm4, %%mm0     \n\t" // am += a4;
+    ::);
+    switch (dx) {
+        case  2: asm volatile("psllw         $5, %%mm2     \n\t" // a2 <<= 5;
+                              "movq       %%mm2, %%mm3     \n\t" // a3   = a2;
+                              "psllw         $1, %%mm2     \n\t" // a2 <<= 1;
+                              "paddsw     %%mm3, %%mm0     \n\t" // am  += a3;
+                              "paddsw     %%mm2, %%mm0     \n\t" // am  += a2;
+                              ::); break;
+        case  4: asm volatile("psllw         $6, %%mm2     \n\t" // a2 <<= 6;
+                              "psllw         $1, %%mm0     \n\t" // am <<= 1;
+                              "paddsw     %%mm2, %%mm0     \n\t" // am  += a2;
+                              ::); break;
+        case  6: asm volatile("psllw         $5, %%mm2     \n\t" // a2 <<= 5;
+                              "movq       %%mm0, %%mm3     \n\t" // a3   = am;
+                              "psllw         $1, %%mm0     \n\t" // am <<= 1;
+                              "paddsw     %%mm3, %%mm0     \n\t" // am  += a3;
+                              "paddsw     %%mm2, %%mm0     \n\t" // am  += a2;
+                              ::); break;
+        case  8: asm volatile("psllw         $2, %%mm0     \n\t" // am <<= 2;
+                              ::); break;
+        case 10: asm volatile("psllw         $5, %%mm3     \n\t" // a3 <<= 5;
+                              "movq       %%mm0, %%mm2     \n\t" // a2   = am;
+                              "psllw         $1, %%mm0     \n\t" // am <<= 1;
+                              "paddsw     %%mm3, %%mm0     \n\t" // am  += a3;
+                              "paddsw     %%mm2, %%mm0     \n\t" // am  += a2;
+                              ::); break;
+        case 12: asm volatile("psllw         $6, %%mm3     \n\t" // a3 <<= 6;
+                              "psllw         $1, %%mm0     \n\t" // am <<= 1;
+                              "paddsw     %%mm3, %%mm0     \n\t" // am  += a3;
+                              ::); break;
+        case 14: asm volatile("psllw         $5, %%mm3     \n\t" // a3 <<= 5;
+                              "movq       %%mm3, %%mm2     \n\t" // a2   = a3;
+                              "psllw         $1, %%mm3     \n\t" // a3 <<= 1;
+                              "paddsw     %%mm2, %%mm0     \n\t" // am  += a2;
+                              "paddsw     %%mm3, %%mm0     \n\t" // am  += a3;
+                              ::); break;
+    }
+    asm volatile("paddsw     %%mm7, %%mm0     \n\t" // am += 64;
+                 "psraw         $7, %%mm0     \n\t" // am >>= 7;
+                 "packuswb   %%mm6, %%mm0     \n\t"
+                 "movd       %%mm0, (%0)      \n\t" // tmp[x] = am;
+    ::"r"(dst));
+}
+
+static always_inline void mc_block_x(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+    int x, y;
+    assert(!(b_w&3) && !(b_h&3) && !(dx&1));
+    asm volatile("pcmpeqw    %%mm7, %%mm7     \n\t"
+                 "psllw        $15, %%mm7     \n\t"
+                 "psrlw         $9, %%mm7     \n\t" // 64
+                 "pxor       %%mm6, %%mm6     \n\t" // 0
+    ::);
+    if (dy == 0) {
+        b_h -= 5;
+        tmp = dst;
+        src += 2*stride;
+    }
+    if (dx != 0) for(y=0; y < b_h+5; y++){
+        for(x=0; x < b_w; x += 4){
+            asm volatile("movd        (%0), %%mm0     \n\t" // am  = src[x    ];
+                         "movd       1(%0), %%mm1     \n\t" // a1  = src[x + 1];
+                         "movd       2(%0), %%mm2     \n\t" // a2  = src[x + 2];
+                         "movd       3(%0), %%mm3     \n\t" // a3  = src[x + 3];
+                         "movd       4(%0), %%mm4     \n\t" // a4  = src[x + 4];
+                         "movd       5(%0), %%mm5     \n\t" // a5  = src[x + 5];
+            ::"r"(&src[x]));
+
+            mc_block_core(dx, &tmp[x]);
+        }
+        tmp += stride;
+        src += stride;
+    }
+}
+
+static always_inline void mc_block_y(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+    int x, y;
+    if (dx == 0) tmp = (uint8_t*)src + 2;
+    if (dy != 0) for(y=0; y < b_h; y++){
+        for(x=0; x < b_w; x += 4){
+            asm volatile("movd        (%0), %%mm0     \n\t" // am  = tmp[x + 0*stride];
+                         "movd        (%1), %%mm1     \n\t" // a1  = tmp[x + 1*stride];
+                         "movd   (%0,%2,2), %%mm2     \n\t" // a2  = tmp[x + 2*stride];
+                         "movd   (%1,%2,2), %%mm3     \n\t" // a3  = tmp[x + 3*stride];
+                         "movd   (%0,%2,4), %%mm4     \n\t" // a4  = tmp[x + 4*stride];
+                         "movd   (%1,%2,4), %%mm5     \n\t" // a5  = tmp[x + 5*stride];
+            ::"r"(&tmp[x]),"r"(&tmp[x+stride]),"a"(stride));
+
+            mc_block_core(dy, &dst[x]);
+        }
+        dst += stride;
+        tmp += stride;
+    }
+    asm volatile("emms"::);
+}
+
+#define mca(a)\
+static void mc_block_x ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+    mc_block_x(dst, src, tmp, stride, b_w, b_h, a, dy);\
+}\
+static void mc_block_y ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+    mc_block_y(dst, src, tmp, stride, b_w, b_h, dx, a);\
+}
+
+mca(0)
+mca(2)
+mca(4)
+mca(6)
+mca(8)
+mca(10)
+mca(12)
+mca(14)
+
+extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
+extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
+extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+extern void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+extern void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -2950,6 +3094,35 @@
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
         }
+
+        if(mm_flags & MM_SSE2){
+            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+        }
+        else{
+            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+        }
+
+        c->mc_block_x[0] = mc_block_x0;
+        c->mc_block_x[1] = mc_block_x2;
+        c->mc_block_x[2] = mc_block_x4;
+        c->mc_block_x[3] = mc_block_x6;
+        c->mc_block_x[4] = mc_block_x8;
+        c->mc_block_x[5] = mc_block_x10;
+        c->mc_block_x[6] = mc_block_x12;
+        c->mc_block_x[7] = mc_block_x14;
+
+        c->mc_block_y[0] = mc_block_y0;
+        c->mc_block_y[1] = mc_block_y2;
+        c->mc_block_y[2] = mc_block_y4;
+        c->mc_block_y[3] = mc_block_y6;
+        c->mc_block_y[4] = mc_block_y8;
+        c->mc_block_y[5] = mc_block_y10;
+        c->mc_block_y[6] = mc_block_y12;
+        c->mc_block_y[7] = mc_block_y14;
     }
 
 #ifdef CONFIG_ENCODERS
--- /dev/null	2006-02-17 20:18:22.000000000 +0200
+++ libavcodec/snow.h	2006-03-10 18:33:42.000000000 +0200
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni at gmx.at>
+ * Copyright (C) 2006 Robert Edele <yartrebo at earthlink.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _SNOW_H
+#define _SNOW_H
+
+
+#define MID_STATE 128
+
+#define MAX_DECOMPOSITIONS 8
+#define MAX_PLANES 4
+#define DWTELEM int
+#define QSHIFT 5
+#define QROOT (1<<QSHIFT)
+#define LOSSLESS_QLOG -128
+#define FRAC_BITS 8
+
+/** Used to minimize the amount of memory used in order to optimize cache performance. **/
+typedef struct {
+    DWTELEM * * line; ///< For use by idwt and predict_slices.
+    DWTELEM * * data_stack; ///< Used for internal purposes.
+    int data_stack_top;
+    int line_count;
+    int line_width;
+    int data_count;
+    DWTELEM * base_buffer; ///< Buffer that this structure is caching.
+} slice_buffer;
+
+#define liftS lift
+#define lift5 lift
+#if 1
+#define W_AM 3
+#define W_AO 0
+#define W_AS 1
+
+#undef liftS
+#define W_BM 1
+#define W_BO 8
+#define W_BS 4
+
+#define W_CM 1
+#define W_CO 0
+#define W_CS 0
+
+#define W_DM 3
+#define W_DO 4
+#define W_DS 3
+#elif 0
+#define W_AM 55
+#define W_AO 16
+#define W_AS 5
+
+#define W_BM 3
+#define W_BO 32
+#define W_BS 6
+
+#define W_CM 127
+#define W_CO 64
+#define W_CS 7
+
+#define W_DM 7
+#define W_DO 8
+#define W_DS 4
+#elif 0
+#define W_AM 97
+#define W_AO 32
+#define W_AS 6
+
+#define W_BM 63
+#define W_BO 512
+#define W_BS 10
+
+#define W_CM 13
+#define W_CO 8
+#define W_CS 4
+
+#define W_DM 15
+#define W_DO 16
+#define W_DS 5
+
+#else
+
+#define W_AM 203
+#define W_AO 64
+#define W_AS 7
+
+#define W_BM 217
+#define W_BO 2048
+#define W_BS 12
+
+#define W_CM 113
+#define W_CO 64
+#define W_CS 7
+
+#define W_DM 227
+#define W_DO 128
+#define W_DS 9
+#endif
+
+extern void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+extern void ff_snow_horizontal_compose97i(DWTELEM *b, int width);
+extern void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+#endif
--- /dev/null	2006-03-09 06:42:28.643578952 -0500
+++ libavcodec/i386/snowdsp_mmx.c	2006-03-09 18:52:28.941598104 -0500
@@ -0,0 +1,1445 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo at earthlink.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Snow MMX and SSE2 optimizations by Robert Edele <yartrebo at earthlink.net>
+ */
+
+#include "../avcodec.h"
+#include "../snow.h"
+#include "mmx.h"
+
+static always_inline void snow_interleave_line_header(int * i, int width, DWTELEM * low, DWTELEM * high){
+    (*i) = (width) - 2;
+
+    if (width & 1){
+        low[(*i)+1] = low[((*i)+1)>>1];
+        (*i)--;
+    }
+}
+
+static always_inline void snow_horizontal_compose_lift_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w, int lift_high, int mul, int add, int shift){
+    for(; i<w; i++){
+        dst[i] = src[i] - ((mul * (ref[i] + ref[i + 1]) + add) >> shift);
+    }
+
+    if((width^lift_high)&1){
+        dst[w] = src[w] - ((mul * 2 * ref[w] + add) >> shift);
+    }
+}
+
+static always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){
+        for(; i<w; i++){
+            dst[i] = src[i] - (((-(ref[i] + ref[(i+1)])+W_BO) - 4 * src[i]) >> W_BS);
+        }
+
+        if(width&1){
+            dst[w] = src[w] - (((-2 * ref[w] + W_BO) - 4 * src[w]) >> W_BS);
+        }
+}
+
+void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
+    const int w2= (width+1)>>1;
+    // SSE2 code runs faster with pointers aligned on a 32-byte boundary.
+    DWTELEM temp_buf[(width>>1) + 4];
+    DWTELEM * const temp = temp_buf + 4 - (((long)temp_buf & 0xF) >> 2);
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        DWTELEM * const ref = b + w2 - 1;
+        DWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+        // The savings in code and time are well worth having to store this value and
+        // calculate b[0] correctly afterwards.
+
+        i = 0;
+        asm volatile(
+            "pcmpeqd   %%xmm7, %%xmm7         \n\t"
+            "pslld        $31, %%xmm7         \n\t"
+            "psrld        $29, %%xmm7         \n\t"
+        ::);
+        for(; i<w_l-7; i+=8){
+            asm volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  4(%1), %%xmm2        \n\t"
+                "movdqu 20(%1), %%xmm6        \n\t"
+                "paddd  %%xmm1, %%xmm2        \n\t"
+                "paddd  %%xmm5, %%xmm6        \n\t"
+                "movdqa %%xmm2, %%xmm0        \n\t"
+                "movdqa %%xmm6, %%xmm4        \n\t"
+                "paddd  %%xmm2, %%xmm2        \n\t"
+                "paddd  %%xmm6, %%xmm6        \n\t"
+                "paddd  %%xmm0, %%xmm2        \n\t"
+                "paddd  %%xmm4, %%xmm6        \n\t"
+                "paddd  %%xmm7, %%xmm2        \n\t"
+                "paddd  %%xmm7, %%xmm6        \n\t"
+                "psrad      $3, %%xmm2        \n\t"
+                "psrad      $3, %%xmm6        \n\t"
+                "movdqa   (%0), %%xmm0        \n\t"
+                "movdqa 16(%0), %%xmm4        \n\t"
+                "psubd  %%xmm2, %%xmm0        \n\t"
+                "psubd  %%xmm6, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+    }
+
+    { // Lift 1
+        DWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
+            dst[i] = dst[i] - (b[i] + b[i + 1]);
+        }
+        for(; i<w_r-7; i+=8){
+            asm volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  4(%1), %%xmm2        \n\t"
+                "movdqu 20(%1), %%xmm6        \n\t"
+                "paddd  %%xmm1, %%xmm2        \n\t"
+                "paddd  %%xmm5, %%xmm6        \n\t"
+                "movdqa   (%0), %%xmm0        \n\t"
+                "movdqa 16(%0), %%xmm4        \n\t"
+                "psubd  %%xmm2, %%xmm0        \n\t"
+                "psubd  %%xmm6, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        DWTELEM * const ref = b+w2 - 1;
+        DWTELEM b_0 = b[0];
+
+        i = 0;
+        asm volatile(
+            "pslld          $1, %%xmm7       \n\t" /* xmm7 already holds a '4' from 2 lifts ago. */
+        ::);
+        for(; i<w_l-7; i+=8){
+            asm volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  4(%1), %%xmm0        \n\t"
+                "movdqu 20(%1), %%xmm4        \n\t"
+                "paddd  %%xmm1, %%xmm0        \n\t"
+                "paddd  %%xmm5, %%xmm4        \n\t"
+                "movdqa %%xmm7, %%xmm1        \n\t"
+                "movdqa %%xmm7, %%xmm5        \n\t"
+                "psubd  %%xmm0, %%xmm1        \n\t"
+                "psubd  %%xmm4, %%xmm5        \n\t"
+                "movdqa   (%0), %%xmm0        \n\t"
+                "movdqa 16(%0), %%xmm4        \n\t"
+                "pslld      $2, %%xmm0        \n\t"
+                "pslld      $2, %%xmm4        \n\t"
+                "psubd  %%xmm0, %%xmm1        \n\t"
+                "psubd  %%xmm4, %%xmm5        \n\t"
+                "psrad      $4, %%xmm1        \n\t"
+                "psrad      $4, %%xmm5        \n\t"
+                "movdqa   (%0), %%xmm0        \n\t"
+                "movdqa 16(%0), %%xmm4        \n\t"
+                "psubd  %%xmm1, %%xmm0        \n\t"
+                "psubd  %%xmm5, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
+    }
+
+    { // Lift 3
+        DWTELEM * const src = b+w2;
+
+        i = 0;
+        for(; (((long)&temp[i]) & 0xF) && i<w_r; i++){
+            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+        }
+        for(; i<w_r-7; i+=8){
+            asm volatile(
+                "movdqu  4(%1), %%xmm2        \n\t"
+                "movdqu 20(%1), %%xmm6        \n\t"
+                "paddd    (%1), %%xmm2        \n\t"
+                "paddd  16(%1), %%xmm6        \n\t"
+                "movdqa %%xmm2, %%xmm0        \n\t"
+                "movdqa %%xmm6, %%xmm4        \n\t"
+                "pslld      $2, %%xmm2        \n\t"
+                "pslld      $2, %%xmm6        \n\t"
+                "psubd  %%xmm2, %%xmm0        \n\t"
+                "psubd  %%xmm6, %%xmm4        \n\t"
+                "psrad      $1, %%xmm0        \n\t"
+                "psrad      $1, %%xmm4        \n\t"
+                "movdqu   (%0), %%xmm2        \n\t"
+                "movdqu 16(%0), %%xmm6        \n\t"
+                "psubd  %%xmm0, %%xmm2        \n\t"
+                "psubd  %%xmm4, %%xmm6        \n\t"
+                "movdqa %%xmm2, (%2)          \n\t"
+                "movdqa %%xmm6, 16(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x1E) != 0x1E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=30; i>=0; i-=32){
+            asm volatile(
+                "movdqa      (%1), %%xmm0       \n\t"
+                "movdqa    16(%1), %%xmm2       \n\t"
+                "movdqa    32(%1), %%xmm4       \n\t"
+                "movdqa    48(%1), %%xmm6       \n\t"
+                "movdqa      (%1), %%xmm1       \n\t"
+                "movdqa    16(%1), %%xmm3       \n\t"
+                "movdqa    32(%1), %%xmm5       \n\t"
+                "movdqa    48(%1), %%xmm7       \n\t"
+                "punpckldq   (%2), %%xmm0       \n\t"
+                "punpckldq 16(%2), %%xmm2       \n\t"
+                "punpckldq 32(%2), %%xmm4       \n\t"
+                "punpckldq 48(%2), %%xmm6       \n\t"
+                "movdqa    %%xmm0, (%0)         \n\t"
+                "movdqa    %%xmm2, 32(%0)       \n\t"
+                "movdqa    %%xmm4, 64(%0)       \n\t"
+                "movdqa    %%xmm6, 96(%0)       \n\t"
+                "punpckhdq   (%2), %%xmm1       \n\t"
+                "punpckhdq 16(%2), %%xmm3       \n\t"
+                "punpckhdq 32(%2), %%xmm5       \n\t"
+                "punpckhdq 48(%2), %%xmm7       \n\t"
+                "movdqa    %%xmm1, 16(%0)       \n\t"
+                "movdqa    %%xmm3, 48(%0)       \n\t"
+                "movdqa    %%xmm5, 80(%0)       \n\t"
+                "movdqa    %%xmm7, 112(%0)      \n\t"
+                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
+    const int w2= (width+1)>>1;
+    DWTELEM temp[width >> 1];
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        DWTELEM * const ref = b + w2 - 1;
+
+        i = 1;
+        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+        asm volatile(
+            "pcmpeqd    %%mm7, %%mm7         \n\t"
+            "pslld        $31, %%mm7         \n\t"
+            "psrld        $29, %%mm7         \n\t"
+           ::);
+        for(; i<w_l-3; i+=4){
+            asm volatile(
+                "movq     (%1), %%mm1        \n\t"
+                "movq    8(%1), %%mm5        \n\t"
+                "movq    4(%1), %%mm2        \n\t"
+                "movq   12(%1), %%mm6        \n\t"
+                "paddd   %%mm1, %%mm2        \n\t"
+                "paddd   %%mm5, %%mm6        \n\t"
+                "movq    %%mm2, %%mm0        \n\t"
+                "movq    %%mm6, %%mm4        \n\t"
+                "paddd   %%mm2, %%mm2        \n\t"
+                "paddd   %%mm6, %%mm6        \n\t"
+                "paddd   %%mm0, %%mm2        \n\t"
+                "paddd   %%mm4, %%mm6        \n\t"
+                "paddd   %%mm7, %%mm2        \n\t"
+                "paddd   %%mm7, %%mm6        \n\t"
+                "psrad      $3, %%mm2        \n\t"
+                "psrad      $3, %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "psubd   %%mm2, %%mm0        \n\t"
+                "psubd   %%mm6, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+    }
+
+    { // Lift 1
+        DWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; i<w_r-3; i+=4){
+            asm volatile(
+                "movq     (%1), %%mm1        \n\t"
+                "movq    8(%1), %%mm5        \n\t"
+                "movq    4(%1), %%mm2        \n\t"
+                "movq   12(%1), %%mm6        \n\t"
+                "paddd   %%mm1, %%mm2        \n\t"
+                "paddd   %%mm5, %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "psubd   %%mm2, %%mm0        \n\t"
+                "psubd   %%mm6, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        DWTELEM * const ref = b+w2 - 1;
+
+        i = 1;
+        b[0] = b[0] - (((-2 * ref[1] + W_BO) - 4 * b[0]) >> W_BS);
+        asm volatile(
+                "pslld          $1, %%mm7       \n\t" /* mm7 already holds a '4' from 2 lifts ago. */
+           ::);
+        for(; i<w_l-3; i+=4){
+            asm volatile(
+                "movq     (%1), %%mm1        \n\t"
+                "movq    8(%1), %%mm5        \n\t"
+                "movq    4(%1), %%mm0        \n\t"
+                "movq   12(%1), %%mm4        \n\t"
+                "paddd   %%mm1, %%mm0        \n\t"
+                "paddd   %%mm5, %%mm4        \n\t"
+                "movq    %%mm7, %%mm1        \n\t"
+                "movq    %%mm7, %%mm5        \n\t"
+                "psubd   %%mm0, %%mm1        \n\t"
+                "psubd   %%mm4, %%mm5        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "pslld      $2, %%mm0        \n\t"
+                "pslld      $2, %%mm4        \n\t"
+                "psubd   %%mm0, %%mm1        \n\t"
+                "psubd   %%mm4, %%mm5        \n\t"
+                "psrad      $4, %%mm1        \n\t"
+                "psrad      $4, %%mm5        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "psubd   %%mm1, %%mm0        \n\t"
+                "psubd   %%mm5, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+    }
+
+    { // Lift 3
+        DWTELEM * const src = b+w2;
+        i = 0;
+
+        for(; i<w_r-3; i+=4){
+            asm volatile(
+                "movq    4(%1), %%mm2        \n\t"
+                "movq   12(%1), %%mm6        \n\t"
+                "paddd    (%1), %%mm2        \n\t"
+                "paddd   8(%1), %%mm6        \n\t"
+                "movq    %%mm2, %%mm0        \n\t"
+                "movq    %%mm6, %%mm4        \n\t"
+                "pslld      $2, %%mm2        \n\t"
+                "pslld      $2, %%mm6        \n\t"
+                "psubd   %%mm2, %%mm0        \n\t"
+                "psubd   %%mm6, %%mm4        \n\t"
+                "psrad      $1, %%mm0        \n\t"
+                "psrad      $1, %%mm4        \n\t"
+                "movq     (%0), %%mm2        \n\t"
+                "movq    8(%0), %%mm6        \n\t"
+                "psubd   %%mm0, %%mm2        \n\t"
+                "psubd   %%mm4, %%mm6        \n\t"
+                "movq    %%mm2, (%2)         \n\t"
+                "movq    %%mm6, 8(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0xE) != 0xE; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=14; i>=0; i-=16){
+            asm volatile(
+                "movq        (%1), %%mm0       \n\t"
+                "movq       8(%1), %%mm2       \n\t"
+                "movq      16(%1), %%mm4       \n\t"
+                "movq      24(%1), %%mm6       \n\t"
+                "movq        (%1), %%mm1       \n\t"
+                "movq       8(%1), %%mm3       \n\t"
+                "movq      16(%1), %%mm5       \n\t"
+                "movq      24(%1), %%mm7       \n\t"
+                "punpckldq   (%2), %%mm0       \n\t"
+                "punpckldq  8(%2), %%mm2       \n\t"
+                "punpckldq 16(%2), %%mm4       \n\t"
+                "punpckldq 24(%2), %%mm6       \n\t"
+                "movq       %%mm0, (%0)        \n\t"
+                "movq       %%mm2, 16(%0)      \n\t"
+                "movq       %%mm4, 32(%0)      \n\t"
+                "movq       %%mm6, 48(%0)      \n\t"
+                "punpckhdq   (%2), %%mm1       \n\t"
+                "punpckhdq  8(%2), %%mm3       \n\t"
+                "punpckhdq 16(%2), %%mm5       \n\t"
+                "punpckhdq 24(%2), %%mm7       \n\t"
+                "movq       %%mm1, 8(%0)       \n\t"
+                "movq       %%mm3, 24(%0)      \n\t"
+                "movq       %%mm5, 40(%0)      \n\t"
+                "movq       %%mm7, 56(%0)      \n\t"
+                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
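+/* Vertical part of the 9/7 compose (inverse transform), SSE2: the scalar
+ * loop at the top handles the tail until the remaining width is a multiple
+ * of 16, then the asm loop applies all four lift steps 16 coefficients at a
+ * time. */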
+void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
+    long i = width;
+
+    while(i & 0xF)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+
+    asm volatile (
+        "mov %6, %%"REG_d"                           \n\t"
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+
+        "mov %5, %%"REG_a"                           \n\t"
+        "mov %3, %%"REG_b"                           \n\t"
+
+        "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0      \n\t"
+        "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2    \n\t"
+        "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4    \n\t"
+        "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6    \n\t"
+
+        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
+        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
+        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
+        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
+
+        "movdqa %%xmm0, %%xmm1                       \n\t"
+        "movdqa %%xmm2, %%xmm3                       \n\t"
+        "movdqa %%xmm4, %%xmm5                       \n\t"
+        "movdqa %%xmm6, %%xmm7                       \n\t"
+
+        "pslld $1, %%xmm0                            \n\t"
+        "pslld $1, %%xmm2                            \n\t"
+        "pslld $1, %%xmm4                            \n\t"
+        "pslld $1, %%xmm6                            \n\t"
+
+        "paddd %%xmm1, %%xmm0                        \n\t"
+        "paddd %%xmm3, %%xmm2                        \n\t"
+        "paddd %%xmm5, %%xmm4                        \n\t"
+        "paddd %%xmm7, %%xmm6                        \n\t"
+
+        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
+        "pslld $31, %%xmm1                           \n\t"
+        "psrld $29, %%xmm1                           \n\t"
+        "mov %4, %%"REG_a"                           \n\t"
+
+        "paddd %%xmm1, %%xmm0                        \n\t"
+        "paddd %%xmm1, %%xmm2                        \n\t"
+        "paddd %%xmm1, %%xmm4                        \n\t"
+        "paddd %%xmm1, %%xmm6                        \n\t"
+
+        "psrad $3, %%xmm0                            \n\t"
+        "psrad $3, %%xmm2                            \n\t"
+        "psrad $3, %%xmm4                            \n\t"
+        "psrad $3, %%xmm6                            \n\t"
+
+        "movdqa (%%"REG_a",%%"REG_d",4), %%xmm1      \n\t"
+        "movdqa 16(%%"REG_a",%%"REG_d",4), %%xmm3    \n\t"
+        "movdqa 32(%%"REG_a",%%"REG_d",4), %%xmm5    \n\t"
+        "movdqa 48(%%"REG_a",%%"REG_d",4), %%xmm7    \n\t"
+
+        "psubd %%xmm0, %%xmm1                        \n\t"
+        "psubd %%xmm2, %%xmm3                        \n\t"
+        "psubd %%xmm4, %%xmm5                        \n\t"
+        "psubd %%xmm6, %%xmm7                        \n\t"
+
+        "movdqa %%xmm1, (%%"REG_a",%%"REG_d",4)      \n\t"
+        "movdqa %%xmm3, 16(%%"REG_a",%%"REG_d",4)    \n\t"
+        "movdqa %%xmm5, 32(%%"REG_a",%%"REG_d",4)    \n\t"
+        "movdqa %%xmm7, 48(%%"REG_a",%%"REG_d",4)    \n\t"
+
+        "mov %2, %%"REG_c"                           \n\t"
+
+        "paddd (%%"REG_c",%%"REG_d",4), %%xmm1       \n\t"
+        "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm3     \n\t"
+        "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm5     \n\t"
+        "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm7     \n\t"
+
+        "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0      \n\t"
+        "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2    \n\t"
+        "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4    \n\t"
+        "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6    \n\t"
+
+        "psubd %%xmm1, %%xmm0                        \n\t"
+        "psubd %%xmm3, %%xmm2                        \n\t"
+        "psubd %%xmm5, %%xmm4                        \n\t"
+        "psubd %%xmm7, %%xmm6                        \n\t"
+
+        "movdqa %%xmm0, (%%"REG_b",%%"REG_d",4)      \n\t"
+        "movdqa %%xmm2, 16(%%"REG_b",%%"REG_d",4)    \n\t"
+        "movdqa %%xmm4, 32(%%"REG_b",%%"REG_d",4)    \n\t"
+        "movdqa %%xmm6, 48(%%"REG_b",%%"REG_d",4)    \n\t"
+
+        "mov %1, %%"REG_a"                           \n\t"
+
+        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
+        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
+        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
+        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
+
+        "movdqa (%%"REG_c",%%"REG_d",4), %%xmm1      \n\t"
+        "movdqa 16(%%"REG_c",%%"REG_d",4), %%xmm3    \n\t"
+        "movdqa 32(%%"REG_c",%%"REG_d",4), %%xmm5    \n\t"
+        "movdqa 48(%%"REG_c",%%"REG_d",4), %%xmm7    \n\t"
+
+        "pslld $2, %%xmm1                            \n\t"
+        "pslld $2, %%xmm3                            \n\t"
+        "pslld $2, %%xmm5                            \n\t"
+        "pslld $2, %%xmm7                            \n\t"
+
+        "paddd %%xmm1, %%xmm0                        \n\t"
+        "paddd %%xmm3, %%xmm2                        \n\t"
+        "paddd %%xmm5, %%xmm4                        \n\t"
+        "paddd %%xmm7, %%xmm6                        \n\t"
+
+        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
+        "pslld $31, %%xmm1                           \n\t"
+        "psrld $28, %%xmm1                           \n\t"
+        "mov %0, %%"REG_b"                           \n\t"
+
+        "paddd %%xmm1, %%xmm0                        \n\t"
+        "paddd %%xmm1, %%xmm2                        \n\t"
+        "paddd %%xmm1, %%xmm4                        \n\t"
+        "paddd %%xmm1, %%xmm6                        \n\t"
+
+        "psrad $4, %%xmm0                            \n\t"
+        "psrad $4, %%xmm2                            \n\t"
+        "psrad $4, %%xmm4                            \n\t"
+        "psrad $4, %%xmm6                            \n\t"
+
+        "paddd (%%"REG_c",%%"REG_d",4), %%xmm0       \n\t"
+        "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm2     \n\t"
+        "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm4     \n\t"
+        "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm6     \n\t"
+
+        "movdqa %%xmm0, (%%"REG_c",%%"REG_d",4)      \n\t"
+        "movdqa %%xmm2, 16(%%"REG_c",%%"REG_d",4)    \n\t"
+        "movdqa %%xmm4, 32(%%"REG_c",%%"REG_d",4)    \n\t"
+        "movdqa %%xmm6, 48(%%"REG_c",%%"REG_d",4)    \n\t"
+
+        "paddd (%%"REG_b",%%"REG_d",4), %%xmm0       \n\t"
+        "paddd 16(%%"REG_b",%%"REG_d",4), %%xmm2     \n\t"
+        "paddd 32(%%"REG_b",%%"REG_d",4), %%xmm4     \n\t"
+        "paddd 48(%%"REG_b",%%"REG_d",4), %%xmm6     \n\t"
+
+        "movdqa %%xmm0, %%xmm1                       \n\t"
+        "movdqa %%xmm2, %%xmm3                       \n\t"
+        "movdqa %%xmm4, %%xmm5                       \n\t"
+        "movdqa %%xmm6, %%xmm7                       \n\t"
+
+        "pslld $1, %%xmm0                            \n\t"
+        "pslld $1, %%xmm2                            \n\t"
+        "pslld $1, %%xmm4                            \n\t"
+        "pslld $1, %%xmm6                            \n\t"
+
+        "paddd %%xmm1, %%xmm0                        \n\t"
+        "paddd %%xmm3, %%xmm2                        \n\t"
+        "paddd %%xmm5, %%xmm4                        \n\t"
+        "paddd %%xmm7, %%xmm6                        \n\t"
+
+        "psrad $1, %%xmm0                            \n\t"
+        "psrad $1, %%xmm2                            \n\t"
+        "psrad $1, %%xmm4                            \n\t"
+        "psrad $1, %%xmm6                            \n\t"
+
+        "paddd (%%"REG_a",%%"REG_d",4), %%xmm0       \n\t"
+        "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2     \n\t"
+        "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4     \n\t"
+        "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6     \n\t"
+
+        "movdqa %%xmm0, (%%"REG_a",%%"REG_d",4)      \n\t"
+        "movdqa %%xmm2, 16(%%"REG_a",%%"REG_d",4)    \n\t"
+        "movdqa %%xmm4, 32(%%"REG_a",%%"REG_d",4)    \n\t"
+        "movdqa %%xmm6, 48(%%"REG_a",%%"REG_d",4)    \n\t"
+
+        "2:                                          \n\t"
+        "sub $16, %%"REG_d"                          \n\t"
+        "jge 1b                                      \n\t"
+        ::
+        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5),"rm"(i):
+        "%"REG_a"","%"REG_b"","%"REG_c"", "%"REG_d"");
+}
+
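+/* MMX version of the vertical compose: identical lift steps, 8 coefficients
+ * per pass. */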
+void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
+    long i = width;
+    while(i & 0x7)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+
+    asm volatile(
+        "mov %6, %%"REG_d"                           \n\t"
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+
+        "mov %5, %%"REG_a"                           \n\t"
+        "mov %3, %%"REG_b"                           \n\t"
+
+        "movq (%%"REG_b",%%"REG_d",4), %%mm0         \n\t"
+        "movq 8(%%"REG_b",%%"REG_d",4), %%mm2        \n\t"
+        "movq 16(%%"REG_b",%%"REG_d",4), %%mm4       \n\t"
+        "movq 24(%%"REG_b",%%"REG_d",4), %%mm6       \n\t"
+
+        "paddd (%%"REG_a",%%"REG_d",4), %%mm0        \n\t"
+        "paddd 8(%%"REG_a",%%"REG_d",4), %%mm2       \n\t"
+        "paddd 16(%%"REG_a",%%"REG_d",4), %%mm4      \n\t"
+        "paddd 24(%%"REG_a",%%"REG_d",4), %%mm6      \n\t"
+
+        "movq %%mm0, %%mm1                           \n\t"
+        "movq %%mm2, %%mm3                           \n\t"
+        "movq %%mm4, %%mm5                           \n\t"
+        "movq %%mm6, %%mm7                           \n\t"
+
+        "pslld $1, %%mm0                             \n\t"
+        "pslld $1, %%mm2                             \n\t"
+        "pslld $1, %%mm4                             \n\t"
+        "pslld $1, %%mm6                             \n\t"
+
+        "paddd %%mm1, %%mm0                          \n\t"
+        "paddd %%mm3, %%mm2                          \n\t"
+        "paddd %%mm5, %%mm4                          \n\t"
+        "paddd %%mm7, %%mm6                          \n\t"
+
+        "pcmpeqd %%mm1, %%mm1                        \n\t"
+        "pslld $31, %%mm1                            \n\t"
+        "psrld $29, %%mm1                            \n\t"
+        "mov %4, %%"REG_a"                           \n\t"
+
+        "paddd %%mm1, %%mm0                          \n\t"
+        "paddd %%mm1, %%mm2                          \n\t"
+        "paddd %%mm1, %%mm4                          \n\t"
+        "paddd %%mm1, %%mm6                          \n\t"
+
+        "psrad $3, %%mm0                             \n\t"
+        "psrad $3, %%mm2                             \n\t"
+        "psrad $3, %%mm4                             \n\t"
+        "psrad $3, %%mm6                             \n\t"
+
+        "movq (%%"REG_a",%%"REG_d",4), %%mm1         \n\t"
+        "movq 8(%%"REG_a",%%"REG_d",4), %%mm3        \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",4), %%mm5       \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",4), %%mm7       \n\t"
+
+        "psubd %%mm0, %%mm1                          \n\t"
+        "psubd %%mm2, %%mm3                          \n\t"
+        "psubd %%mm4, %%mm5                          \n\t"
+        "psubd %%mm6, %%mm7                          \n\t"
+
+        "movq %%mm1, (%%"REG_a",%%"REG_d",4)         \n\t"
+        "movq %%mm3, 8(%%"REG_a",%%"REG_d",4)        \n\t"
+        "movq %%mm5, 16(%%"REG_a",%%"REG_d",4)       \n\t"
+        "movq %%mm7, 24(%%"REG_a",%%"REG_d",4)       \n\t"
+
+        "mov %2, %%"REG_c"                           \n\t"
+
+        "paddd (%%"REG_c",%%"REG_d",4), %%mm1        \n\t"
+        "paddd 8(%%"REG_c",%%"REG_d",4), %%mm3       \n\t"
+        "paddd 16(%%"REG_c",%%"REG_d",4), %%mm5      \n\t"
+        "paddd 24(%%"REG_c",%%"REG_d",4), %%mm7      \n\t"
+
+        "movq (%%"REG_b",%%"REG_d",4), %%mm0         \n\t"
+        "movq 8(%%"REG_b",%%"REG_d",4), %%mm2        \n\t"
+        "movq 16(%%"REG_b",%%"REG_d",4), %%mm4       \n\t"
+        "movq 24(%%"REG_b",%%"REG_d",4), %%mm6       \n\t"
+
+        "psubd %%mm1, %%mm0                          \n\t"
+        "psubd %%mm3, %%mm2                          \n\t"
+        "psubd %%mm5, %%mm4                          \n\t"
+        "psubd %%mm7, %%mm6                          \n\t"
+
+        "movq %%mm0, (%%"REG_b",%%"REG_d",4)         \n\t"
+        "movq %%mm2, 8(%%"REG_b",%%"REG_d",4)        \n\t"
+        "movq %%mm4, 16(%%"REG_b",%%"REG_d",4)       \n\t"
+        "movq %%mm6, 24(%%"REG_b",%%"REG_d",4)       \n\t"
+
+        "mov %1, %%"REG_a"                           \n\t"
+
+        "paddd (%%"REG_a",%%"REG_d",4), %%mm0        \n\t"
+        "paddd 8(%%"REG_a",%%"REG_d",4), %%mm2       \n\t"
+        "paddd 16(%%"REG_a",%%"REG_d",4), %%mm4      \n\t"
+        "paddd 24(%%"REG_a",%%"REG_d",4), %%mm6      \n\t"
+
+        "movq (%%"REG_c",%%"REG_d",4), %%mm1         \n\t"
+        "movq 8(%%"REG_c",%%"REG_d",4), %%mm3        \n\t"
+        "movq 16(%%"REG_c",%%"REG_d",4), %%mm5       \n\t"
+        "movq 24(%%"REG_c",%%"REG_d",4), %%mm7       \n\t"
+
+        "pslld $2, %%mm1                             \n\t"
+        "pslld $2, %%mm3                             \n\t"
+        "pslld $2, %%mm5                             \n\t"
+        "pslld $2, %%mm7                             \n\t"
+
+        "paddd %%mm1, %%mm0                          \n\t"
+        "paddd %%mm3, %%mm2                          \n\t"
+        "paddd %%mm5, %%mm4                          \n\t"
+        "paddd %%mm7, %%mm6                          \n\t"
+
+        "pcmpeqd %%mm1, %%mm1                        \n\t"
+        "pslld $31, %%mm1                            \n\t"
+        "psrld $28, %%mm1                            \n\t"
+        "mov %0, %%"REG_b"                           \n\t"
+
+        "paddd %%mm1, %%mm0                          \n\t"
+        "paddd %%mm1, %%mm2                          \n\t"
+        "paddd %%mm1, %%mm4                          \n\t"
+        "paddd %%mm1, %%mm6                          \n\t"
+
+        "psrad $4, %%mm0                             \n\t"
+        "psrad $4, %%mm2                             \n\t"
+        "psrad $4, %%mm4                             \n\t"
+        "psrad $4, %%mm6                             \n\t"
+
+        "paddd (%%"REG_c",%%"REG_d",4), %%mm0        \n\t"
+        "paddd 8(%%"REG_c",%%"REG_d",4), %%mm2       \n\t"
+        "paddd 16(%%"REG_c",%%"REG_d",4), %%mm4      \n\t"
+        "paddd 24(%%"REG_c",%%"REG_d",4), %%mm6      \n\t"
+
+        "movq %%mm0, (%%"REG_c",%%"REG_d",4)         \n\t"
+        "movq %%mm2, 8(%%"REG_c",%%"REG_d",4)        \n\t"
+        "movq %%mm4, 16(%%"REG_c",%%"REG_d",4)       \n\t"
+        "movq %%mm6, 24(%%"REG_c",%%"REG_d",4)       \n\t"
+
+        "paddd (%%"REG_b",%%"REG_d",4), %%mm0        \n\t"
+        "paddd 8(%%"REG_b",%%"REG_d",4), %%mm2       \n\t"
+        "paddd 16(%%"REG_b",%%"REG_d",4), %%mm4      \n\t"
+        "paddd 24(%%"REG_b",%%"REG_d",4), %%mm6      \n\t"
+
+        "movq %%mm0, %%mm1                           \n\t"
+        "movq %%mm2, %%mm3                           \n\t"
+        "movq %%mm4, %%mm5                           \n\t"
+        "movq %%mm6, %%mm7                           \n\t"
+
+        "pslld $1, %%mm0                             \n\t"
+        "pslld $1, %%mm2                             \n\t"
+        "pslld $1, %%mm4                             \n\t"
+        "pslld $1, %%mm6                             \n\t"
+
+        "paddd %%mm1, %%mm0                          \n\t"
+        "paddd %%mm3, %%mm2                          \n\t"
+        "paddd %%mm5, %%mm4                          \n\t"
+        "paddd %%mm7, %%mm6                          \n\t"
+
+        "psrad $1, %%mm0                             \n\t"
+        "psrad $1, %%mm2                             \n\t"
+        "psrad $1, %%mm4                             \n\t"
+        "psrad $1, %%mm6                             \n\t"
+
+        "paddd (%%"REG_a",%%"REG_d",4), %%mm0        \n\t"
+        "paddd 8(%%"REG_a",%%"REG_d",4), %%mm2       \n\t"
+        "paddd 16(%%"REG_a",%%"REG_d",4), %%mm4      \n\t"
+        "paddd 24(%%"REG_a",%%"REG_d",4), %%mm6      \n\t"
+
+        "movq %%mm0, (%%"REG_a",%%"REG_d",4)         \n\t"
+        "movq %%mm2, 8(%%"REG_a",%%"REG_d",4)        \n\t"
+        "movq %%mm4, 16(%%"REG_a",%%"REG_d",4)       \n\t"
+        "movq %%mm6, 24(%%"REG_a",%%"REG_d",4)       \n\t"
+
+        "2:                                          \n\t"
+        "sub $8, %%"REG_d"                           \n\t"
+        "jge 1b                                      \n\t"
+        ::
+        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5),"rm"(i):
+        "%"REG_a"","%"REG_b"","%"REG_c"", "%"REG_d"");
+}
+
+
+
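+/* OBMC block add for b_w==8, obmc_stride==16 (SSE2). Two block rows are
+ * processed per loop iteration, so b_h must be even; odd heights use the
+ * MMX routine below instead (see ff_snow_inner_add_yblock_sse2()). */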
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    DWTELEM * * dst_array = sb->line + src_y;
+
+    asm volatile(
+             "mov  %6, %%"REG_c"             \n\t"
+             "mov  %5, %%"REG_b"             \n\t"
+             "mov  %3, %%"REG_S"             \n\t"
+             "pcmpeqd %%xmm4, %%xmm4         \n\t"
+             "pslld $31, %%xmm4              \n\t"
+             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */
+             "psrld $24, %%xmm4              \n\t" /* FRAC_BITS >> 1 */
+
+             "1:                              \n\t"
+             "movq (%%"REG_S"), %%xmm0       \n\t"
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%xmm7, %%xmm0       \n\t"
+             "movq 8(%%"REG_S"), %%xmm1      \n\t"
+             "punpcklbw %%xmm7, %%xmm1       \n\t"
+             "movq (%%"REG_d"), %%xmm5       \n\t"
+             "mov %1, %%"REG_D"              \n\t"
+             "punpcklbw %%xmm7, %%xmm5       \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movq (%%"REG_d"), %%xmm6       \n\t"
+             "pmullw %%xmm0, %%xmm5          \n\t"
+             "punpcklbw %%xmm7, %%xmm6       \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"
+
+             "movq 128(%%"REG_S"), %%xmm0    \n\t"
+             "pmullw %%xmm1, %%xmm6          \n\t"
+             "punpcklbw %%xmm7, %%xmm0       \n\t"
+             "movq 136(%%"REG_S"), %%xmm1    \n\t"
+             "add %2, %%"REG_D"              \n\t"
+             "punpcklbw %%xmm7, %%xmm1       \n\t"
+             "movq (%%"REG_d"), %%xmm2       \n\t"
+             "punpcklbw %%xmm7, %%xmm2       \n\t"
+             "mov (%%"REG_a"), %%"REG_d"     \n\t"
+             "paddusw %%xmm5, %%xmm6         \n\t"
+             "pmullw %%xmm0, %%xmm2          \n\t"
+             "movq (%%"REG_d"), %%xmm3       \n\t"
+             "mov %0, %%"REG_d"              \n\t"
+             "punpcklbw %%xmm7, %%xmm3       \n\t"
+             "paddusw %%xmm2, %%xmm6         \n\t"
+             "pmullw %%xmm1, %%xmm3          \n\t"
+             "paddusw %%xmm3, %%xmm6         \n\t"
+
+             "movdqa (%%"REG_D"), %%xmm3     \n\t"
+             "movdqa %%xmm6, %%xmm0          \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm5   \n\t"
+             "punpckhwd %%xmm7, %%xmm6       \n\t"
+             "movq 24(%%"REG_S"), %%xmm1     \n\t"
+             "punpcklwd %%xmm7, %%xmm0       \n\t"
+             "paddd %%xmm0, %%xmm3           \n\t"
+             "paddd %%xmm6, %%xmm5           \n\t"
+             "punpcklbw %%xmm7, %%xmm1       \n\t"
+             "paddd %%xmm4, %%xmm3           \n\t"
+             "paddd %%xmm4, %%xmm5           \n\t"
+             "movq 16(%%"REG_S"), %%xmm0     \n\t"
+             "psrad $8, %%xmm3               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */
+
+             "packssdw %%xmm5, %%xmm3        \n\t"
+             "mov %1, %%"REG_D"              \n\t"
+             "packuswb %%xmm7, %%xmm3        \n\t"
+
+             "movq %%xmm3, (%%"REG_d")       \n\t"
+
+
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%xmm7, %%xmm0       \n\t"
+             "movq (%%"REG_d",%%"REG_c"), %%xmm5; \n\t"
+             "punpcklbw %%xmm7, %%xmm5       \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movq (%%"REG_d",%%"REG_c"), %%xmm6; \n\t"
+             "pmullw %%xmm0, %%xmm5          \n\t"
+             "punpcklbw %%xmm7, %%xmm6       \n\t"
+
+             "movq 144(%%"REG_S"), %%xmm0    \n\t"
+             "pmullw %%xmm1, %%xmm6          \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%xmm7, %%xmm0       \n\t"
+             "movq 152(%%"REG_S"), %%xmm1    \n\t"
+             "punpcklbw %%xmm7, %%xmm1       \n\t"
+             "movq (%%"REG_d",%%"REG_c"), %%xmm2;\n\t"
+             "punpcklbw %%xmm7, %%xmm2       \n\t"
+             "mov (%%"REG_a"), %%"REG_d"     \n\t"
+             "paddusw %%xmm5, %%xmm6         \n\t"
+             "pmullw %%xmm0, %%xmm2          \n\t"
+             "movq (%%"REG_d",%%"REG_c"), %%xmm3;\n\t"
+             "punpcklbw %%xmm7, %%xmm3       \n\t"
+             "paddusw %%xmm2, %%xmm6         \n\t"
+             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
+             "pmullw %%xmm1, %%xmm3          \n\t"
+             "sal $1, %%"REG_c"              \n\t"
+             "add %2, %%"REG_D"              \n\t"
+             "paddusw %%xmm3, %%xmm6         \n\t"
+             "mov %0, %%"REG_d"              \n\t"
+
+             "movdqa (%%"REG_D"), %%xmm3     \n\t"
+             "movdqa %%xmm6, %%xmm0          \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm5   \n\t"
+             "punpckhwd %%xmm7, %%xmm6       \n\t"
+             "punpcklwd %%xmm7, %%xmm0       \n\t"
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
+             "paddd %%xmm0, %%xmm3           \n\t"
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
+             "paddd %%xmm6, %%xmm5           \n\t"
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
+             "paddd %%xmm4, %%xmm3           \n\t"
+             "add %%"REG_c", (%%"REG_a")     \n\t"
+             "paddd %%xmm4, %%xmm5           \n\t"
+             "psrad $8, %%xmm3               \n\t" /* FRAC_BITS. */
+             "add $"PTR_SIZE"*2, %1          \n\t"
+             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */
+             "add $32, %%"REG_S"             \n\t"
+
+             "packssdw %%xmm5, %%xmm3        \n\t"
+             "add %%"REG_c", %0              \n\t"
+             "packuswb %%xmm7, %%xmm3        \n\t"
+
+             "sar $1, %%"REG_c"              \n\t"
+             "movq %%xmm3, (%%"REG_d",%%"REG_c");\n\t"
+
+             "sub $2, %%"REG_b"              \n\t"
+             "jnz 1b                         \n\t"
+             :
+             :
+             "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):
+             "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+}
+
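+/* OBMC block add for b_w==16, obmc_stride==32 (SSE2), one block row per
+ * iteration. */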
+static void inner_add_yblock_bw_16_obmc_32_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    DWTELEM * * dst_array = sb->line + src_y;
+
+    asm volatile(
+             "mov  %6, %%"REG_c"             \n\t"
+             "mov  %5, %%"REG_b"             \n\t"
+             "mov  %3, %%"REG_S"             \n\t"
+             "pcmpeqd %%xmm4, %%xmm4         \n\t"
+             "pslld $31, %%xmm4              \n\t"
+             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */
+             "psrld $24, %%xmm4              \n\t" /* FRAC_BITS >> 1 */
+
+             "1:                              \n\t"
+             "movq (%%"REG_S"), %%xmm0       \n\t"
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%xmm7, %%xmm0       \n\t"
+             "movq 16(%%"REG_S"), %%xmm1     \n\t"
+             "punpcklbw %%xmm7, %%xmm1       \n\t"
+             "movq (%%"REG_d"), %%xmm5       \n\t"
+             "mov %1, %%"REG_D"              \n\t"
+             "punpcklbw %%xmm7, %%xmm5       \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movq (%%"REG_d"), %%xmm6       \n\t"
+             "pmullw %%xmm0, %%xmm5          \n\t"
+             "punpcklbw %%xmm7, %%xmm6       \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+
+             "movq 512(%%"REG_S"), %%xmm0    \n\t"
+             "pmullw %%xmm1, %%xmm6          \n\t"
+             "punpcklbw %%xmm7, %%xmm0       \n\t"
+             "movq 528(%%"REG_S"), %%xmm1    \n\t"
+             "punpcklbw %%xmm7, %%xmm1       \n\t"
+             "movq (%%"REG_d"), %%xmm2       \n\t"
+             "punpcklbw %%xmm7, %%xmm2       \n\t"
+             "mov (%%"REG_a"), %%"REG_d"     \n\t"
+             "paddusw %%xmm5, %%xmm6         \n\t"
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"
+             "pmullw %%xmm0, %%xmm2          \n\t"
+             "movq (%%"REG_d"), %%xmm3       \n\t"
+             "mov %0, %%"REG_d"              \n\t"
+             "punpcklbw %%xmm7, %%xmm3       \n\t"
+             "add %2, %%"REG_D"              \n\t"
+             "paddusw %%xmm2, %%xmm6         \n\t"
+             "pmullw %%xmm1, %%xmm3          \n\t"
+             "paddusw %%xmm3, %%xmm6         \n\t"
+
+             "movdqa (%%"REG_D"), %%xmm3     \n\t"
+             "movdqa %%xmm6, %%xmm0          \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm5   \n\t"
+             "punpckhwd %%xmm7, %%xmm6       \n\t"
+             "movq 24(%%"REG_S"), %%xmm1     \n\t"
+             "punpcklwd %%xmm7, %%xmm0       \n\t"
+             "paddd %%xmm0, %%xmm3           \n\t"
+             "paddd %%xmm6, %%xmm5           \n\t"
+             "punpcklbw %%xmm7, %%xmm1       \n\t"
+             "paddd %%xmm4, %%xmm3           \n\t"
+             "paddd %%xmm4, %%xmm5           \n\t"
+             "movq 8(%%"REG_S"), %%xmm0      \n\t"
+             "psrad $8, %%xmm3               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */
+
+             "packssdw %%xmm5, %%xmm3        \n\t"
+             "packuswb %%xmm7, %%xmm3        \n\t"
+
+             "movq %%xmm3, (%%"REG_d")       \n\t"
+
+
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%xmm7, %%xmm0       \n\t"
+             "movq 8(%%"REG_d"), %%xmm5      \n\t"
+             "punpcklbw %%xmm7, %%xmm5       \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movq 8(%%"REG_d"), %%xmm6      \n\t"
+             "pmullw %%xmm0, %%xmm5          \n\t"
+             "punpcklbw %%xmm7, %%xmm6       \n\t"
+
+             "movq 520(%%"REG_S"), %%xmm0    \n\t"
+             "pmullw %%xmm1, %%xmm6          \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%xmm7, %%xmm0       \n\t"
+             "movq 536(%%"REG_S"), %%xmm1    \n\t"
+             "punpcklbw %%xmm7, %%xmm1       \n\t"
+             "movq 8(%%"REG_d"), %%xmm2      \n\t"
+             "punpcklbw %%xmm7, %%xmm2       \n\t"
+             "mov (%%"REG_a"), %%"REG_d"     \n\t"
+             "paddusw %%xmm5, %%xmm6         \n\t"
+             "pmullw %%xmm0, %%xmm2          \n\t"
+             "movq 8(%%"REG_d"), %%xmm3      \n\t"
+             "punpcklbw %%xmm7, %%xmm3       \n\t"
+             "paddusw %%xmm2, %%xmm6         \n\t"
+             "pmullw %%xmm1, %%xmm3          \n\t"
+             "paddusw %%xmm3, %%xmm6         \n\t"
+             "mov %0, %%"REG_d"              \n\t"
+
+             "movdqa 32(%%"REG_D"), %%xmm3   \n\t"
+             "movdqa %%xmm6, %%xmm0          \n\t"
+             "movdqa 48(%%"REG_D"), %%xmm5   \n\t"
+             "punpckhwd %%xmm7, %%xmm6       \n\t"
+             "punpcklwd %%xmm7, %%xmm0       \n\t"
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
+             "paddd %%xmm0, %%xmm3           \n\t"
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
+             "paddd %%xmm6, %%xmm5           \n\t"
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
+             "paddd %%xmm4, %%xmm3           \n\t"
+             "add %%"REG_c", (%%"REG_a")     \n\t"
+             "paddd %%xmm4, %%xmm5           \n\t"
+             "psrad $8, %%xmm3               \n\t" /* FRAC_BITS. */
+             "add $"PTR_SIZE"*1, %1          \n\t"
+             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */
+             "add $32, %%"REG_S"             \n\t"
+
+             "packssdw %%xmm5, %%xmm3        \n\t"
+             "add %%"REG_c", %0              \n\t"
+             "packuswb %%xmm7, %%xmm3        \n\t"
+
+             "movq %%xmm3, 8(%%"REG_d")      \n\t"
+
+             "dec %%"REG_b"                  \n\t"
+             "jnz 1b                         \n\t"
+             :
+             :
+             "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):
+             "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+}
+
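+/* OBMC block add for b_w==8, obmc_stride==16 (MMX), one row per iteration;
+ * works for any b_h and so also serves as the odd-height fallback for the
+ * SSE2 path. */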
+static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    DWTELEM * * dst_array = sb->line + src_y;
+
+    asm volatile(
+             "mov  %6, %%"REG_c"             \n\t"
+             "mov  %5, %%"REG_b"             \n\t"
+             "mov  %3, %%"REG_S"             \n\t"
+             "pcmpeqd %%mm4, %%mm4           \n\t"
+             "pslld $31, %%mm4               \n\t"
+             "pxor %%mm7, %%mm7              \n\t" /* 0 */
+             "psrld $24, %%mm4               \n\t" /* FRAC_BITS >> 1 */
+
+             "1:                              \n\t"
+             "movd (%%"REG_S"), %%mm0        \n\t"
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0         \n\t"
+             "movd 8(%%"REG_S"), %%mm1       \n\t"
+             "punpcklbw %%mm7, %%mm1         \n\t"
+             "movd (%%"REG_d"), %%mm5        \n\t"
+             "mov %1, %%"REG_D"              \n\t"
+             "punpcklbw %%mm7, %%mm5         \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movd (%%"REG_d"), %%mm6        \n\t"
+             "pmullw %%mm0, %%mm5            \n\t"
+             "punpcklbw %%mm7, %%mm6         \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+
+             "movd 128(%%"REG_S"), %%mm0     \n\t"
+             "pmullw %%mm1, %%mm6            \n\t"
+             "punpcklbw %%mm7, %%mm0         \n\t"
+             "movd 136(%%"REG_S"), %%mm1     \n\t"
+             "punpcklbw %%mm7, %%mm1         \n\t"
+             "movd (%%"REG_d"), %%mm2        \n\t"
+             "punpcklbw %%mm7, %%mm2         \n\t"
+             "mov (%%"REG_a"), %%"REG_d"     \n\t"
+             "paddusw %%mm5, %%mm6           \n\t"
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"
+             "pmullw %%mm0, %%mm2            \n\t"
+             "movd (%%"REG_d"), %%mm3        \n\t"
+             "mov %0, %%"REG_d"              \n\t"
+             "punpcklbw %%mm7, %%mm3         \n\t"
+             "add %2, %%"REG_D"              \n\t"
+             "paddusw %%mm2, %%mm6           \n\t"
+             "pmullw %%mm1, %%mm3            \n\t"
+             "paddusw %%mm3, %%mm6           \n\t"
+
+             "movq (%%"REG_D"), %%mm3        \n\t"
+             "movq %%mm6, %%mm0              \n\t"
+             "movq 8(%%"REG_D"), %%mm5       \n\t"
+             "punpckhwd %%mm7, %%mm6         \n\t"
+             "movd 12(%%"REG_S"), %%mm1      \n\t"
+             "punpcklwd %%mm7, %%mm0         \n\t"
+             "paddd %%mm0, %%mm3             \n\t"
+             "paddd %%mm6, %%mm5             \n\t"
+             "punpcklbw %%mm7, %%mm1         \n\t"
+             "paddd %%mm4, %%mm3             \n\t"
+             "paddd %%mm4, %%mm5             \n\t"
+             "movd 4(%%"REG_S"), %%mm0       \n\t"
+             "psrad $8, %%mm3                \n\t" /* FRAC_BITS. */
+             "psrad $8, %%mm5                \n\t" /* FRAC_BITS. */
+
+             "packssdw %%mm5, %%mm3          \n\t"
+             "packuswb %%mm7, %%mm3          \n\t"
+
+             "movd %%mm3, (%%"REG_d")        \n\t"
+
+
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0         \n\t"
+             "movd 4(%%"REG_d"), %%mm5       \n\t"
+             "punpcklbw %%mm7, %%mm5         \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movd 4(%%"REG_d"), %%mm6       \n\t"
+             "pmullw %%mm0, %%mm5            \n\t"
+             "punpcklbw %%mm7, %%mm6         \n\t"
+
+             "movd 132(%%"REG_S"), %%mm0     \n\t"
+             "pmullw %%mm1, %%mm6            \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0         \n\t"
+             "movd 140(%%"REG_S"), %%mm1     \n\t"
+             "punpcklbw %%mm7, %%mm1         \n\t"
+             "movd 4(%%"REG_d"), %%mm2       \n\t"
+             "punpcklbw %%mm7, %%mm2         \n\t"
+             "mov (%%"REG_a"), %%"REG_d"     \n\t"
+             "paddusw %%mm5, %%mm6           \n\t"
+             "pmullw %%mm0, %%mm2            \n\t"
+             "movd 4(%%"REG_d"), %%mm3       \n\t"
+             "punpcklbw %%mm7, %%mm3         \n\t"
+             "paddusw %%mm2, %%mm6           \n\t"
+             "pmullw %%mm1, %%mm3            \n\t"
+             "paddusw %%mm3, %%mm6           \n\t"
+             "mov %0, %%"REG_d"              \n\t"
+
+             "movq 16(%%"REG_D"), %%mm3      \n\t"
+             "movq %%mm6, %%mm0              \n\t"
+             "movq 24(%%"REG_D"), %%mm5      \n\t"
+             "punpckhwd %%mm7, %%mm6         \n\t"
+             "punpcklwd %%mm7, %%mm0         \n\t"
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
+             "paddd %%mm0, %%mm3             \n\t"
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
+             "paddd %%mm6, %%mm5             \n\t"
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
+             "paddd %%mm4, %%mm3             \n\t"
+             "add %%"REG_c", (%%"REG_a")     \n\t"
+             "paddd %%mm4, %%mm5             \n\t"
+             "psrad $8, %%mm3                \n\t" /* FRAC_BITS. */
+             "add $"PTR_SIZE"*1, %1          \n\t"
+             "psrad $8, %%mm5                \n\t" /* FRAC_BITS. */
+             "add $16, %%"REG_S"             \n\t"
+
+             "packssdw %%mm5, %%mm3          \n\t"
+             "add %%"REG_c", %0              \n\t"
+             "packuswb %%mm7, %%mm3          \n\t"
+
+             "movd %%mm3, 4(%%"REG_d")       \n\t"
+
+             "dec %%"REG_b"                  \n\t"
+             "jnz 1b                         \n\t"
+             :
+             :
+             "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):
+             "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+}
+
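+/* OBMC block add for b_w==16, obmc_stride==32 (MMX), one block row per
+ * iteration. */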
+static void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    DWTELEM * * dst_array = sb->line + src_y;
+
+    asm volatile(
+             "mov  %6, %%"REG_c"             \n\t"
+             "mov  %5, %%"REG_b"             \n\t"
+             "mov  %3, %%"REG_S"             \n\t"
+             "pcmpeqd %%mm4, %%mm4          \n\t"
+             "pslld $31, %%mm4              \n\t"
+             "pxor %%mm7, %%mm7             \n\t" /* 0 */
+             "psrld $24, %%mm4              \n\t" /* FRAC_BITS >> 1 */
+
+             "1:                              \n\t"
+             "movd (%%"REG_S"), %%mm0        \n\t"
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0        \n\t"
+             "movd 16(%%"REG_S"), %%mm1     \n\t"
+             "punpcklbw %%mm7, %%mm1        \n\t"
+             "movd (%%"REG_d"), %%mm5       \n\t"
+             "mov %1, %%"REG_D"             \n\t"
+             "punpcklbw %%mm7, %%mm5        \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movd (%%"REG_d"), %%mm6       \n\t"
+             "pmullw %%mm0, %%mm5           \n\t"
+             "punpcklbw %%mm7, %%mm6        \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+
+             "movd 512(%%"REG_S"), %%mm0    \n\t"
+             "pmullw %%mm1, %%mm6           \n\t"
+             "punpcklbw %%mm7, %%mm0        \n\t"
+             "movd 528(%%"REG_S"), %%mm1    \n\t"
+             "punpcklbw %%mm7, %%mm1        \n\t"
+             "movd (%%"REG_d"), %%mm2       \n\t"
+             "punpcklbw %%mm7, %%mm2        \n\t"
+             "mov (%%"REG_a"), %%"REG_d"    \n\t"
+             "paddusw %%mm5, %%mm6          \n\t"
+             "mov (%%"REG_D"), %%"REG_D"    \n\t"
+             "pmullw %%mm0, %%mm2           \n\t"
+             "movd (%%"REG_d"), %%mm3       \n\t"
+             "mov %0, %%"REG_d"             \n\t"
+             "punpcklbw %%mm7, %%mm3        \n\t"
+             "add %2, %%"REG_D"             \n\t"
+             "paddusw %%mm2, %%mm6          \n\t"
+             "pmullw %%mm1, %%mm3           \n\t"
+             "paddusw %%mm3, %%mm6          \n\t"
+
+             "movq (%%"REG_D"), %%mm3       \n\t"
+             "movq %%mm6, %%mm0             \n\t"
+             "movq 8(%%"REG_D"), %%mm5      \n\t"
+             "punpckhwd %%mm7, %%mm6        \n\t"
+             "movd 20(%%"REG_S"), %%mm1     \n\t"
+             "punpcklwd %%mm7, %%mm0        \n\t"
+             "paddd %%mm0, %%mm3            \n\t"
+             "paddd %%mm6, %%mm5            \n\t"
+             "punpcklbw %%mm7, %%mm1        \n\t"
+             "paddd %%mm4, %%mm3            \n\t"
+             "paddd %%mm4, %%mm5            \n\t"
+             "movd 4(%%"REG_S"), %%mm0      \n\t"
+             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
+
+             "packssdw %%mm5, %%mm3         \n\t"
+             "packuswb %%mm7, %%mm3         \n\t"
+
+             "movd %%mm3, (%%"REG_d")       \n\t"
+
+
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0        \n\t"
+             "movd 4(%%"REG_d"), %%mm5      \n\t"
+             "punpcklbw %%mm7, %%mm5        \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movd 4(%%"REG_d"), %%mm6      \n\t"
+             "pmullw %%mm0, %%mm5           \n\t"
+             "punpcklbw %%mm7, %%mm6        \n\t"
+
+             "movd 516(%%"REG_S"), %%mm0    \n\t"
+             "pmullw %%mm1, %%mm6           \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0        \n\t"
+             "movd 532(%%"REG_S"), %%mm1    \n\t"
+             "punpcklbw %%mm7, %%mm1        \n\t"
+             "movd 4(%%"REG_d"), %%mm2      \n\t"
+             "punpcklbw %%mm7, %%mm2        \n\t"
+             "mov (%%"REG_a"), %%"REG_d"     \n\t"
+             "paddusw %%mm5, %%mm6          \n\t"
+             "pmullw %%mm0, %%mm2           \n\t"
+             "movd 4(%%"REG_d"), %%mm3      \n\t"
+             "punpcklbw %%mm7, %%mm3        \n\t"
+             "paddusw %%mm2, %%mm6          \n\t"
+             "pmullw %%mm1, %%mm3           \n\t"
+             "paddusw %%mm3, %%mm6          \n\t"
+             "mov %0, %%"REG_d"             \n\t"
+
+             "movq 16(%%"REG_D"), %%mm3     \n\t"
+             "movq %%mm6, %%mm0             \n\t"
+             "movq 24(%%"REG_D"), %%mm5     \n\t"
+             "punpckhwd %%mm7, %%mm6        \n\t"
+             "punpcklwd %%mm7, %%mm0        \n\t"
+             "paddd %%mm0, %%mm3            \n\t"
+             "paddd %%mm6, %%mm5            \n\t"
+             "paddd %%mm4, %%mm3            \n\t"
+             "paddd %%mm4, %%mm5            \n\t"
+             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
+
+             "packssdw %%mm5, %%mm3         \n\t"
+             "packuswb %%mm7, %%mm3         \n\t"
+
+             "movd %%mm3, 4(%%"REG_d")      \n\t"
+
+
+
+             "movd 8(%%"REG_S"), %%mm0      \n\t"
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0        \n\t"
+             "movd 24(%%"REG_S"), %%mm1     \n\t"
+             "punpcklbw %%mm7, %%mm1        \n\t"
+             "movd 8(%%"REG_d"), %%mm5      \n\t"
+             "punpcklbw %%mm7, %%mm5        \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movd 8(%%"REG_d"), %%mm6      \n\t"
+             "pmullw %%mm0, %%mm5           \n\t"
+             "punpcklbw %%mm7, %%mm6        \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+
+             "movd 520(%%"REG_S"), %%mm0    \n\t"
+             "pmullw %%mm1, %%mm6           \n\t"
+             "punpcklbw %%mm7, %%mm0        \n\t"
+             "movd 536(%%"REG_S"), %%mm1    \n\t"
+             "punpcklbw %%mm7, %%mm1        \n\t"
+             "movd 8(%%"REG_d"), %%mm2      \n\t"
+             "punpcklbw %%mm7, %%mm2        \n\t"
+             "mov (%%"REG_a"), %%"REG_d"    \n\t"
+             "paddusw %%mm5, %%mm6          \n\t"
+             "pmullw %%mm0, %%mm2           \n\t"
+             "movd 8(%%"REG_d"), %%mm3      \n\t"
+             "mov %0, %%"REG_d"             \n\t"
+             "punpcklbw %%mm7, %%mm3        \n\t"
+             "paddusw %%mm2, %%mm6          \n\t"
+             "pmullw %%mm1, %%mm3           \n\t"
+             "paddusw %%mm3, %%mm6          \n\t"
+
+             "movq 32(%%"REG_D"), %%mm3     \n\t"
+             "movq %%mm6, %%mm0             \n\t"
+             "movq 40(%%"REG_D"), %%mm5     \n\t"
+             "punpckhwd %%mm7, %%mm6        \n\t"
+             "movd 28(%%"REG_S"), %%mm1     \n\t"
+             "punpcklwd %%mm7, %%mm0        \n\t"
+             "paddd %%mm0, %%mm3            \n\t"
+             "paddd %%mm6, %%mm5            \n\t"
+             "punpcklbw %%mm7, %%mm1        \n\t"
+             "paddd %%mm4, %%mm3            \n\t"
+             "paddd %%mm4, %%mm5            \n\t"
+             "movd 12(%%"REG_S"), %%mm0     \n\t"
+             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
+
+             "packssdw %%mm5, %%mm3         \n\t"
+             "packuswb %%mm7, %%mm3         \n\t"
+
+             "movd %%mm3, 8(%%"REG_d")      \n\t"
+
+
+             "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0        \n\t"
+             "movd 12(%%"REG_d"), %%mm5     \n\t"
+             "punpcklbw %%mm7, %%mm5        \n\t"
+             "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+             "movd 12(%%"REG_d"), %%mm6     \n\t"
+             "pmullw %%mm0, %%mm5           \n\t"
+             "punpcklbw %%mm7, %%mm6        \n\t"
+
+             "movd 524(%%"REG_S"), %%mm0    \n\t"
+             "pmullw %%mm1, %%mm6           \n\t"
+             "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+             "punpcklbw %%mm7, %%mm0        \n\t"
+             "movd 540(%%"REG_S"), %%mm1    \n\t"
+             "punpcklbw %%mm7, %%mm1        \n\t"
+             "movd 12(%%"REG_d"), %%mm2     \n\t"
+             "punpcklbw %%mm7, %%mm2        \n\t"
+             "mov (%%"REG_a"), %%"REG_d"    \n\t"
+             "paddusw %%mm5, %%mm6          \n\t"
+             "pmullw %%mm0, %%mm2           \n\t"
+             "movd 12(%%"REG_d"), %%mm3     \n\t"
+             "punpcklbw %%mm7, %%mm3        \n\t"
+             "paddusw %%mm2, %%mm6          \n\t"
+             "pmullw %%mm1, %%mm3           \n\t"
+             "paddusw %%mm3, %%mm6          \n\t"
+             "mov %0, %%"REG_d"             \n\t"
+
+             "movq 48(%%"REG_D"), %%mm3     \n\t"
+             "movq %%mm6, %%mm0             \n\t"
+             "movq 56(%%"REG_D"), %%mm5     \n\t"
+             "punpckhwd %%mm7, %%mm6        \n\t"
+             "punpcklwd %%mm7, %%mm0        \n\t"
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
+             "paddd %%mm0, %%mm3            \n\t"
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
+             "paddd %%mm6, %%mm5            \n\t"
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
+             "paddd %%mm4, %%mm3            \n\t"
+             "add %%"REG_c", (%%"REG_a")    \n\t"
+             "paddd %%mm4, %%mm5            \n\t"
+             "psrad $8, %%mm3               \n\t" /* FRAC_BITS. */
+             "add $"PTR_SIZE"*1, %1         \n\t"
+             "psrad $8, %%mm5               \n\t" /* FRAC_BITS. */
+             "add $32, %%"REG_S"            \n\t"
+
+             "packssdw %%mm5, %%mm3         \n\t"
+             "add %%"REG_c", %0             \n\t"
+             "packuswb %%mm7, %%mm3         \n\t"
+
+             "movd %%mm3, 12(%%"REG_d")     \n\t"
+
+             "dec %%"REG_b"                 \n\t"
+             "jnz 1b                        \n\t"
+             :
+             :
+             "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):
+             "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+}
+
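+/* Dispatch to the specialised SSE2 routines above; block geometries they do
+ * not cover go through the generic C implementation. */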
+void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+    if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 8 && obmc_stride == 16) {
+        if (!(b_h & 1))
+            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+        else
+            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    } else
+         ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
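+/* Same dispatch for the MMX-only case. */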
+void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 8 && obmc_stride == 16)
+        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else
+        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}


