[Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations
Oded Shimon
ods15
Fri Mar 10 18:41:36 CET 2006
On Thu, Mar 09, 2006 at 07:14:24PM -0500, Robert Edele wrote:
> On Thu, 2006-03-09 at 15:23 -0800, Loren Merritt wrote:
> > On Thu, 9 Mar 2006, Robert Edele wrote:
> >
> > > I've removed interleave_line_footer which is never invoked, as it is not
> > > used by the asm cases and Michael's code is used for the C case.
> > >
> > > Michael, I believe that the patch is ready to be reviewed and hopefully
> > > committed. There might be a small bug or two left, but the code looks
> > > good to my eyes and has been tested on AMD-64, P4, and AMD-32 systems
> > > with no issues (no crashes, and md5sums all agree). Regression tests on
> > > my machine all pass (C, mmx, sse2).
> > >
> > > If there are any more remaining issues with the code, please let me
> > > know. Thanks for your patience.
> >
> > >+ if (!(b_h & 1))
> > >+ inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride,
> > >+ block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
> > >+ else
> > >+ inner_add_yblock_bw_8_obmc_16_bh_even_mmx(obmc, obmc_stride,
> > >+ block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
> >
> > inner_add_yblock_bw_8_obmc_16_bh_even_mmx is called when bh is odd?
> >
> > --Loren Merritt
>
> That must have been an oversight by Oded (ods15) while helping me out in
> porting the code out of snow.c and renaming that function after reading
> the corresponding sse2 function (which does require even b_h). The mmx
> version handles either odd or even just fine and it was never written
> with the word 'even' in it. On the plus side, he is the one that thought
> of using the mmx as a fallback should b_h be odd instead of falling back
> on the C version.
Heh, I didn't think of anything; I just copied the behavior as it was when you
sent me the non-dsputil patch...
On a completely different note, I have optimized mc_block. The C version is
30-50% faster, and the MMX version is about 70% faster. qpel is no longer
expensive in Snow... I merged the two patches here... md5sum checks out.
I was worried about odd resolutions, but it seems Snow has stopped
supporting any resolution not divisible by 8...
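To sketch the mc_block change (simplified; the real per-offset specializations
and the MMX versions are in the dsputil.c / dsputil_mmx.c hunks of the attached
patch, and the dispatch tables live in DSPContext rather than at file scope as
shown here): the old single mc_block() is split into a horizontal and a
vertical pass, with one specialized function per even subpel offset, and a
pass is skipped entirely when its offset is zero.

#include <stdint.h>

/* function pointer type matching the patch's mc_block_func */
typedef void (*mc_block_func)(uint8_t *dst, const uint8_t *src, uint8_t *tmp,
                              int stride, int b_w, int b_h, int dx, int dy);

/* one specialization per even subpel offset 0,2,...,14, filled by the init code */
static mc_block_func mc_block_x[8], mc_block_y[8];

static void mc_block(uint8_t *dst, const uint8_t *src, uint8_t *tmp,
                     int stride, int b_w, int b_h, int dx, int dy)
{
    /* horizontal pass: filters b_h+5 rows into tmp so the vertical pass has
       context; when dy == 0 it writes only b_h rows straight into dst and
       the vertical pass below does nothing */
    mc_block_x[dx>>1](dst, src, tmp, stride, b_w, b_h, dx, dy);
    /* vertical pass: reads tmp (or src+2 directly when dx == 0) and writes
       the final clipped bytes to dst */
    mc_block_y[dy>>1](dst, src, tmp, stride, b_w, b_h, dx, dy);
}

Since dx (or dy) is a compile-time constant inside each specialization, the
blend towards the nearer integer sample folds into shifts and adds (see
mc_block_core in the MMX code), and a whole pass disappears whenever the
offset is zero.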
- ods15
-------------- next part --------------
Index: libavcodec/snow.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/snow.c,v
retrieving revision 1.87
diff -u -r1.87 snow.c
--- libavcodec/snow.c 30 Jan 2006 23:33:18 -0000 1.87
+++ libavcodec/snow.c 10 Mar 2006 16:36:15 -0000
@@ -19,23 +19,15 @@
#include "avcodec.h"
#include "common.h"
#include "dsputil.h"
+#include "snow.h"
#include "rangecoder.h"
-#define MID_STATE 128
#include "mpegvideo.h"
#undef NDEBUG
#include <assert.h>
-#define MAX_DECOMPOSITIONS 8
-#define MAX_PLANES 4
-#define DWTELEM int
-#define QSHIFT 5
-#define QROOT (1<<QSHIFT)
-#define LOSSLESS_QLOG -128
-#define FRAC_BITS 8
-
static const int8_t quant3[256]={
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -181,7 +173,7 @@
-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
};
-#define LOG2_OBMC_MAX 6
+#define LOG2_OBMC_MAX 8
#define OBMC_MAX (1<<(LOG2_OBMC_MAX))
#if 0 //64*cubic
static const uint8_t obmc32[1024]={
@@ -240,6 +232,39 @@
};
#elif 1 // 64*linear
static const uint8_t obmc32[1024]={
+ 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0,
+ 0, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12, 8, 8, 8, 4, 4, 4, 0,
+ 0, 4, 8, 8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12, 8, 8, 4, 0,
+ 0, 4, 8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12, 8, 4, 0,
+ 4, 8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12, 8, 4,
+ 4, 8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12, 8, 4,
+ 4, 8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16, 8, 4,
+ 4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12, 4,
+ 4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12, 4,
+ 4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16, 4,
+ 4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16, 4,
+ 4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16, 4,
+ 8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20, 8,
+ 8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20, 8,
+ 8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20, 8,
+ 8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24, 8,
+ 8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24, 8,
+ 8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20, 8,
+ 8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20, 8,
+ 8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20, 8,
+ 4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16, 4,
+ 4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16, 4,
+ 4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16, 4,
+ 4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12, 4,
+ 4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12, 4,
+ 4, 8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16, 8, 4,
+ 4, 8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12, 8, 4,
+ 4, 8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12, 8, 4,
+ 0, 4, 8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12, 8, 4, 0,
+ 0, 4, 8, 8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12, 8, 8, 4, 0,
+ 0, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12, 8, 8, 8, 4, 4, 4, 0,
+ 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0,
+/*
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
@@ -272,9 +297,27 @@
0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+*/
//error:0.000020
};
static const uint8_t obmc16[256]={
+ 0, 4, 4, 8, 8, 12, 12, 16, 16, 12, 12, 8, 8, 4, 4, 0,
+ 4, 8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16, 8, 4,
+ 4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16, 4,
+ 8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20, 8,
+ 8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28, 8,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+ 8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28, 8,
+ 8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20, 8,
+ 4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16, 4,
+ 4, 8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16, 8, 4,
+ 0, 4, 4, 8, 8, 12, 12, 16, 16, 12, 12, 8, 8, 4, 4, 0,
+/*
0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
@@ -291,6 +334,7 @@
1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
+*/
//error:0.000015
};
#else //64*cos
@@ -352,6 +396,15 @@
//linear *64
static const uint8_t obmc8[64]={
+ 4, 12, 20, 28, 28, 20, 12, 4,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+ 20, 60,100,140,140,100, 60, 20,
+ 28, 84,140,196,196,140, 84, 28,
+ 28, 84,140,196,196,140, 84, 28,
+ 20, 60,100,140,140,100, 60, 20,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+ 4, 12, 20, 28, 28, 20, 12, 4,
+/*
1, 3, 5, 7, 7, 5, 3, 1,
3, 9,15,21,21,15, 9, 3,
5,15,25,35,35,25,15, 5,
@@ -360,15 +413,22 @@
5,15,25,35,35,25,15, 5,
3, 9,15,21,21,15, 9, 3,
1, 3, 5, 7, 7, 5, 3, 1,
+*/
//error:0.000000
};
//linear *64
static const uint8_t obmc4[16]={
+ 16, 48, 48, 16,
+ 48,144,144, 48,
+ 48,144,144, 48,
+ 16, 48, 48, 16,
+ /*
4,12,12, 4,
12,36,36,12,
12,36,36,12,
4,12,12, 4,
+ */
//error:0.000000
};
@@ -425,17 +485,6 @@
SubBand band[MAX_DECOMPOSITIONS][4];
}Plane;
-/** Used to minimize the amount of memory used in order to optimize cache performance. **/
-typedef struct {
- DWTELEM * * line; ///< For use by idwt and predict_slices.
- DWTELEM * * data_stack; ///< Used for internal purposes.
- int data_stack_top;
- int line_count;
- int line_width;
- int data_count;
- DWTELEM * base_buffer; ///< Buffer that this structure is caching.
-} slice_buffer;
-
typedef struct SnowContext{
// MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
@@ -741,6 +790,7 @@
}
}
+#ifndef lift5
static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
const int mirror_left= !highpass;
const int mirror_right= (width&1) ^ highpass;
@@ -770,7 +820,9 @@
dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
}
}
+#endif
+#ifndef liftS
static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
const int mirror_left= !highpass;
const int mirror_right= (width&1) ^ highpass;
@@ -793,6 +845,7 @@
dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
}
}
+#endif
static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
@@ -1111,76 +1164,6 @@
}
}
-#define liftS lift
-#define lift5 lift
-#if 1
-#define W_AM 3
-#define W_AO 0
-#define W_AS 1
-
-#undef liftS
-#define W_BM 1
-#define W_BO 8
-#define W_BS 4
-
-#define W_CM 1
-#define W_CO 0
-#define W_CS 0
-
-#define W_DM 3
-#define W_DO 4
-#define W_DS 3
-#elif 0
-#define W_AM 55
-#define W_AO 16
-#define W_AS 5
-
-#define W_BM 3
-#define W_BO 32
-#define W_BS 6
-
-#define W_CM 127
-#define W_CO 64
-#define W_CS 7
-
-#define W_DM 7
-#define W_DO 8
-#define W_DS 4
-#elif 0
-#define W_AM 97
-#define W_AO 32
-#define W_AS 6
-
-#define W_BM 63
-#define W_BO 512
-#define W_BS 10
-
-#define W_CM 13
-#define W_CO 8
-#define W_CS 4
-
-#define W_DM 15
-#define W_DO 16
-#define W_DS 5
-
-#else
-
-#define W_AM 203
-#define W_AO 64
-#define W_AS 7
-
-#define W_BM 217
-#define W_BO 2048
-#define W_BS 12
-
-#define W_CM 113
-#define W_CO 64
-#define W_CS 7
-
-#define W_DM 227
-#define W_DO 128
-#define W_DS 9
-#endif
static void horizontal_decompose97i(DWTELEM *b, int width){
DWTELEM temp[width];
const int w2= (width+1)>>1;
@@ -1410,7 +1393,7 @@
}
-static void horizontal_compose97i(DWTELEM *b, int width){
+void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
DWTELEM temp[width];
const int w2= (width+1)>>1;
@@ -1463,7 +1446,7 @@
}
}
-static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
+void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
int i;
for(i=0; i<width; i++){
@@ -1504,7 +1487,7 @@
cs->y = -3;
}
-static void spatial_compose97i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
+static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
int y = cs->y;
DWTELEM *b0= cs->b0;
@@ -1516,7 +1499,7 @@
{START_TIMER
if(y>0 && y+4<height){
- vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
+ dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
}else{
if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
@@ -1527,8 +1510,8 @@
STOP_TIMER("vertical_compose97i")}}
{START_TIMER
- if(y-1<(unsigned)height) horizontal_compose97i(b0, width);
- if(y+0<(unsigned)height) horizontal_compose97i(b1, width);
+ if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
+ if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
if(width>400 && y+0<(unsigned)height){
STOP_TIMER("horizontal_compose97i")}}
@@ -1557,8 +1540,8 @@
STOP_TIMER("vertical_compose97i")}}
{START_TIMER
- if(y-1<(unsigned)height) horizontal_compose97i(b0, width);
- if(y+0<(unsigned)height) horizontal_compose97i(b1, width);
+ if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
+ if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
if(width>400 && b0 <= b2){
STOP_TIMER("horizontal_compose97i")}}
@@ -1619,7 +1602,7 @@
}
}
-static void ff_spatial_idwt_buffered_slice(dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
+static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
const int support = type==1 ? 3 : 5;
int level;
if(type==2) return;
@@ -1627,7 +1610,7 @@
for(level=decomposition_count-1; level>=0; level--){
while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
switch(type){
- case 0: spatial_compose97i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
+ case 0: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
break;
case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
break;
@@ -2381,91 +2364,6 @@
}
}
-static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
- int x, y;
-START_TIMER
- for(y=0; y < b_h+5; y++){
- for(x=0; x < b_w; x++){
- int a0= src[x ];
- int a1= src[x + 1];
- int a2= src[x + 2];
- int a3= src[x + 3];
- int a4= src[x + 4];
- int a5= src[x + 5];
-// int am= 9*(a1+a2) - (a0+a3);
- int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
-// int am= 18*(a2+a3) - 2*(a1+a4);
-// int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
-// int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
-
-// if(b_w==16) am= 8*(a1+a2);
-
- if(dx<8) am = (32*a2*( 8-dx) + am* dx + 128)>>8;
- else am = ( am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
-
- /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
- if(am&(~255)) am= ~(am>>31);
-
- tmp[x] = am;
-
-/* if (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) + aL* dx + 32)>>6;
- else if(dx< 8) tmp[x + y*stride]= ( aL*( 8-dx) + am*(dx- 4) + 32)>>6;
- else if(dx<12) tmp[x + y*stride]= ( am*(12-dx) + aR*(dx- 8) + 32)>>6;
- else tmp[x + y*stride]= ( aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
- }
- tmp += stride;
- src += stride;
- }
- tmp -= (b_h+5)*stride;
-
- for(y=0; y < b_h; y++){
- for(x=0; x < b_w; x++){
- int a0= tmp[x + 0*stride];
- int a1= tmp[x + 1*stride];
- int a2= tmp[x + 2*stride];
- int a3= tmp[x + 3*stride];
- int a4= tmp[x + 4*stride];
- int a5= tmp[x + 5*stride];
- int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
-// int am= 18*(a2+a3) - 2*(a1+a4);
-/* int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
- int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
-
-// if(b_w==16) am= 8*(a1+a2);
-
- if(dy<8) am = (32*a2*( 8-dy) + am* dy + 128)>>8;
- else am = ( am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
-
- if(am&(~255)) am= ~(am>>31);
-
- dst[x] = am;
-/* if (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) + aL* dy + 32)>>6;
- else if(dy< 8) tmp[x + y*stride]= ( aL*( 8-dy) + am*(dy- 4) + 32)>>6;
- else if(dy<12) tmp[x + y*stride]= ( am*(12-dy) + aR*(dy- 8) + 32)>>6;
- else tmp[x + y*stride]= ( aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
- }
- dst += stride;
- tmp += stride;
- }
-STOP_TIMER("mc_block")
-}
-
-#define mca(dx,dy,b_w)\
-static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
- uint8_t tmp[stride*(b_w+5)];\
- assert(h==b_w);\
- mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
-}
-
-mca( 0, 0,16)
-mca( 8, 0,16)
-mca( 0, 8,16)
-mca( 8, 8,16)
-mca( 0, 0,8)
-mca( 8, 0,8)
-mca( 0, 8,8)
-mca( 8, 8,8)
-
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
if(block->type & BLOCK_INTRA){
int x, y;
@@ -2524,9 +2422,14 @@
assert(!(b_w&(b_w-1)));
assert(b_w>1 && b_h>1);
assert(tab_index>=0 && tab_index<4 || b_w==32);
- if((dx&3) || (dy&3))
- mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
- else if(b_w==32){
+ if((dx&3) || (dy&3)) {
+ START_TIMER
+ assert(!(dx&1) && !(dy&1));
+ assert(dx<16 && dy<16);
+ s->dsp.mc_block_x[dx>>1](dst, src, tmp, stride, b_w, b_h, dx, dy);
+ s->dsp.mc_block_y[dy>>1](dst, src, tmp, stride, b_w, b_h, dx, dy);
+ STOP_TIMER("mc_block")
+ } else if(b_w==32){
int y;
for(y=0; y<b_h; y+=16){
s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
@@ -2545,6 +2448,40 @@
}
}
+void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ int y, x;
+ DWTELEM * dst;
+ for(y=0; y<b_h; y++){
+ //FIXME ugly misuse of obmc_stride
+ uint8_t *obmc1= obmc + y*obmc_stride;
+ uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+ uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+ uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+ dst = slice_buffer_get_line(sb, src_y + y);
+ for(x=0; x<b_w; x++){
+ int v= obmc1[x] * block[3][x + y*src_stride]
+ +obmc2[x] * block[2][x + y*src_stride]
+ +obmc3[x] * block[1][x + y*src_stride]
+ +obmc4[x] * block[0][x + y*src_stride];
+
+ v <<= 8 - LOG2_OBMC_MAX;
+ if(FRAC_BITS != 8){
+ v += 1<<(7 - FRAC_BITS);
+ v >>= 8 - FRAC_BITS;
+ }
+ if(add){
+ v += dst[x + src_x];
+ v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+ if(v&(~255)) v= ~(v>>31);
+ dst8[x + y*src_stride] = v;
+ }else{
+ dst[x + src_x] -= v;
+ }
+ }
+ }
+}
+
//FIXME name clenup (b_w, block_w, b_width stuff)
static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
DWTELEM * dst = NULL;
@@ -2669,36 +2606,7 @@
START_TIMER
- for(y=0; y<b_h; y++){
- //FIXME ugly missue of obmc_stride
- uint8_t *obmc1= obmc + y*obmc_stride;
- uint8_t *obmc2= obmc1+ (obmc_stride>>1);
- uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
- uint8_t *obmc4= obmc3+ (obmc_stride>>1);
- dst = slice_buffer_get_line(sb, src_y + y);
- for(x=0; x<b_w; x++){
- int v= obmc1[x] * block[3][x + y*src_stride]
- +obmc2[x] * block[2][x + y*src_stride]
- +obmc3[x] * block[1][x + y*src_stride]
- +obmc4[x] * block[0][x + y*src_stride];
-
- v <<= 8 - LOG2_OBMC_MAX;
- if(FRAC_BITS != 8){
- v += 1<<(7 - FRAC_BITS);
- v >>= 8 - FRAC_BITS;
- }
- if(add){
-// v += old_dst[x + y*dst_stride];
- v += dst[x + src_x];
- v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
- if(v&(~255)) v= ~(v>>31);
- dst8[x + y*src_stride] = v;
- }else{
-// old_dst[x + y*dst_stride] -= v;
- dst[x + src_x] -= v;
- }
- }
- }
+ s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
STOP_TIMER("Inner add y block")
}
#endif
@@ -3044,7 +2952,7 @@
}
*b= backup;
- return clip(((ab<<6) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
+ return clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
}
static inline int get_block_bits(SnowContext *s, int x, int y, int w){
@@ -3104,10 +3012,10 @@
const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
int sx= block_w*mb_x - block_w/2;
int sy= block_w*mb_y - block_w/2;
- const int x0= FFMAX(0,-sx);
- const int y0= FFMAX(0,-sy);
- const int x1= FFMIN(block_w*2, w-sx);
- const int y1= FFMIN(block_w*2, h-sy);
+ int x0= FFMAX(0,-sx);
+ int y0= FFMAX(0,-sy);
+ int x1= FFMIN(block_w*2, w-sx);
+ int y1= FFMIN(block_w*2, h-sy);
int i,x,y;
pred_block(s, cur, ref, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
@@ -3125,6 +3033,22 @@
}
}
+ /* copy the regions where obmc[] = (uint8_t)256 */
+ if(LOG2_OBMC_MAX == 8
+ && (mb_x == 0 || mb_x == b_stride-1)
+ && (mb_y == 0 || mb_y == b_height-1)){
+ if(mb_x == 0)
+ x1 = block_w;
+ else
+ x0 = block_w;
+ if(mb_y == 0)
+ y1 = block_w;
+ else
+ y0 = block_w;
+ for(y=y0; y<y1; y++)
+ memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
+ }
+
//FIXME sad/ssd can be broken up, but wavelet cmp should be one 32x32 block
if(block_w==16){
distortion = 0;
@@ -3820,19 +3744,6 @@
mcf( 8,12)
mcf(12,12)
-#define mcfh(dx,dy)\
- s->dsp.put_pixels_tab [0][dy/4+dx/8]=\
- s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
- mc_block_hpel ## dx ## dy ## 16;\
- s->dsp.put_pixels_tab [1][dy/4+dx/8]=\
- s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
- mc_block_hpel ## dx ## dy ## 8;
-
- mcfh(0, 0)
- mcfh(8, 0)
- mcfh(0, 8)
- mcfh(8, 8)
-
if(!qexp[0])
init_qexp();
@@ -4387,7 +4298,7 @@
{ START_TIMER
for(; yd<slice_h; yd+=4){
- ff_spatial_idwt_buffered_slice(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
+ ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
}
STOP_TIMER("idwt slice");}
Index: libavcodec/dsputil.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.c,v
retrieving revision 1.134
diff -u -r1.134 dsputil.c
--- libavcodec/dsputil.c 10 Feb 2006 06:55:24 -0000 1.134
+++ libavcodec/dsputil.c 10 Mar 2006 16:36:16 -0000
@@ -3772,6 +3772,77 @@
dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
+static always_inline void mc_block_x(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+ int x, y;
+ if (dy == 0) {
+ b_h -= 5;
+ tmp = dst;
+ src += 2*stride;
+ }
+ if (dx != 0) for(y=0; y < b_h+5; y++){
+ for(x=0; x < b_w; x++){
+ int a0= src[x ];
+ int a1= src[x + 1];
+ int a2= src[x + 2];
+ int a3= src[x + 3];
+ int a4= src[x + 4];
+ int a5= src[x + 5];
+ int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+
+ if(dx<8) am = (32*a2*( 8-dx) + am* dx + 128)>>8;
+ else am = ( am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
+
+ if(am&(~255)) am= ~(am>>31);
+
+ tmp[x] = am;
+ }
+ tmp += stride;
+ src += stride;
+ }
+}
+
+static always_inline void mc_block_y(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+ int x, y;
+ if (dx == 0) tmp = (uint8_t*)src + 2;
+ if (dy != 0) for(y=0; y < b_h; y++){
+ for(x=0; x < b_w; x++){
+ int a0= tmp[x + 0*stride];
+ int a1= tmp[x + 1*stride];
+ int a2= tmp[x + 2*stride];
+ int a3= tmp[x + 3*stride];
+ int a4= tmp[x + 4*stride];
+ int a5= tmp[x + 5*stride];
+ int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+
+ if(dy<8) am = (32*a2*( 8-dy) + am* dy + 128)>>8;
+ else am = ( am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
+
+ if(am&(~255)) am= ~(am>>31);
+
+ dst[x] = am;
+ }
+ dst += stride;
+ tmp += stride;
+ }
+}
+
+#define mca(a)\
+static void mc_block_x ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+ mc_block_x(dst, src, tmp, stride, b_w, b_h, a, dy);\
+}\
+static void mc_block_y ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+ mc_block_y(dst, src, tmp, stride, b_w, b_h, dx, a);\
+}
+
+mca(0)
+mca(2)
+mca(4)
+mca(6)
+mca(8)
+mca(10)
+mca(12)
+mca(14)
+
/* init static data */
void dsputil_static_init(void)
{
@@ -4047,6 +4118,28 @@
c->try_8x8basis= try_8x8basis_c;
c->add_8x8basis= add_8x8basis_c;
+ c->vertical_compose97i = ff_snow_vertical_compose97i;
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i;
+ c->inner_add_yblock = ff_snow_inner_add_yblock;
+
+ c->mc_block_x[0] = mc_block_x0;
+ c->mc_block_x[1] = mc_block_x2;
+ c->mc_block_x[2] = mc_block_x4;
+ c->mc_block_x[3] = mc_block_x6;
+ c->mc_block_x[4] = mc_block_x8;
+ c->mc_block_x[5] = mc_block_x10;
+ c->mc_block_x[6] = mc_block_x12;
+ c->mc_block_x[7] = mc_block_x14;
+
+ c->mc_block_y[0] = mc_block_y0;
+ c->mc_block_y[1] = mc_block_y2;
+ c->mc_block_y[2] = mc_block_y4;
+ c->mc_block_y[3] = mc_block_y6;
+ c->mc_block_y[4] = mc_block_y8;
+ c->mc_block_y[5] = mc_block_y10;
+ c->mc_block_y[6] = mc_block_y12;
+ c->mc_block_y[7] = mc_block_y14;
+
#ifdef HAVE_MMX
dsputil_init_mmx(c, avctx);
#endif
Index: libavcodec/dsputil.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.h,v
retrieving revision 1.129
diff -u -r1.129 dsputil.h
--- libavcodec/dsputil.h 8 Mar 2006 04:13:55 -0000 1.129
+++ libavcodec/dsputil.h 10 Mar 2006 16:36:17 -0000
@@ -30,6 +30,7 @@
#include "common.h"
#include "avcodec.h"
+#include "snow.h"
//#define DEBUG
@@ -132,6 +133,8 @@
// allthough currently h<4 is not used as functions with width <8 are not used and neither implemented
typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
+typedef void (*mc_block_func)(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy);
+
/**
* DSPContext.
@@ -334,6 +337,14 @@
void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+
+ /* snow wavelet */
+ void (*vertical_compose97i)(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+ void (*horizontal_compose97i)(DWTELEM *b, int width);
+ void (*inner_add_yblock)(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+ mc_block_func mc_block_x[8];
+ mc_block_func mc_block_y[8];
} DSPContext;
void dsputil_static_init(void);
Index: libavcodec/Makefile
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/Makefile,v
retrieving revision 1.244
diff -u -r1.244 Makefile
--- libavcodec/Makefile 8 Mar 2006 04:13:55 -0000 1.244
+++ libavcodec/Makefile 10 Mar 2006 16:36:17 -0000
@@ -330,7 +330,7 @@
i386/dsputil_mmx.o i386/mpegvideo_mmx.o \
i386/idct_mmx.o i386/motion_est_mmx.o \
i386/simple_idct_mmx.o i386/fft_sse.o i386/vp3dsp_mmx.o \
- i386/vp3dsp_sse2.o i386/fft_3dn.o i386/fft_3dn2.o
+ i386/vp3dsp_sse2.o i386/fft_3dn.o i386/fft_3dn2.o i386/snowdsp_mmx.o
ifeq ($(CONFIG_GPL),yes)
OBJS += i386/idct_mmx_xvid.o
endif
Index: libavcodec/i386/mmx.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/mmx.h,v
retrieving revision 1.7
diff -u -r1.7 mmx.h
--- libavcodec/i386/mmx.h 22 Dec 2005 01:10:09 -0000 1.7
+++ libavcodec/i386/mmx.h 10 Mar 2006 16:36:17 -0000
@@ -12,6 +12,7 @@
# define REG_d "rdx"
# define REG_D "rdi"
# define REG_S "rsi"
+# define PTR_SIZE "8"
#else
# define REG_a "eax"
# define REG_b "ebx"
@@ -19,6 +20,7 @@
# define REG_d "edx"
# define REG_D "edi"
# define REG_S "esi"
+# define PTR_SIZE "4"
#endif
/*
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
retrieving revision 1.113
diff -u -r1.113 dsputil_mmx.c
--- libavcodec/i386/dsputil_mmx.c 7 Mar 2006 22:45:56 -0000 1.113
+++ libavcodec/i386/dsputil_mmx.c 10 Mar 2006 16:36:19 -0000
@@ -2564,6 +2564,150 @@
}
#endif
+static always_inline void mc_block_core(int dx, uint8_t * dst) {
+ asm volatile("punpcklbw %%mm6, %%mm0 \n\t"
+ "punpcklbw %%mm6, %%mm1 \n\t"
+ "punpcklbw %%mm6, %%mm2 \n\t"
+ "punpcklbw %%mm6, %%mm3 \n\t"
+ "punpcklbw %%mm6, %%mm4 \n\t"
+ "punpcklbw %%mm6, %%mm5 \n\t"
+
+ "paddsw %%mm5, %%mm0 \n\t" // am += a5;
+
+ "paddsw %%mm4, %%mm1 \n\t" // a1 += a4;
+ "movq %%mm1, %%mm4 \n\t" // a4 = a1;
+ "psllw $2, %%mm1 \n\t" // a1 *= 4;
+ "paddsw %%mm4, %%mm1 \n\t" // a1 += a4;
+ "psubsw %%mm1, %%mm0 \n\t" // am -= a1;
+
+ "movq %%mm2, %%mm4 \n\t" // a4 = a2;
+ "paddsw %%mm3, %%mm4 \n\t" // a4 += a3;
+
+ "psllw $2, %%mm4 \n\t" // a4 *= 4;
+ "paddsw %%mm4, %%mm0 \n\t" // am += a1;
+ "psllw $2, %%mm4 \n\t" // a4 *= 4;
+ "paddsw %%mm4, %%mm0 \n\t" // am += a1;
+ ::);
+ switch (dx) {
+ case 2: asm volatile("psllw $5, %%mm2 \n\t" // a2 <<= 5;
+ "movq %%mm2, %%mm3 \n\t" // a3 = a2;
+ "psllw $1, %%mm2 \n\t" // a2 <<= 1;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ ::); break;
+ case 4: asm volatile("psllw $6, %%mm2 \n\t" // a2 <<= 6;
+ "psllw $1, %%mm0 \n\t" // am <<= 1;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ ::); break;
+ case 6: asm volatile("psllw $5, %%mm2 \n\t" // a2 <<= 5;
+ "movq %%mm0, %%mm3 \n\t" // a3 = am;
+ "psllw $1, %%mm0 \n\t" // am <<= 1;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ ::); break;
+ case 8: asm volatile("psllw $2, %%mm0 \n\t" // am <<= 2;
+ ::); break;
+ case 10: asm volatile("psllw $5, %%mm3 \n\t" // a3 <<= 5;
+ "movq %%mm0, %%mm2 \n\t" // a2 = am;
+ "psllw $1, %%mm0 \n\t" // am <<= 1;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ ::); break;
+ case 12: asm volatile("psllw $6, %%mm3 \n\t" // a3 <<= 6;
+ "psllw $1, %%mm0 \n\t" // am <<= 1;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ ::); break;
+ case 14: asm volatile("psllw $5, %%mm3 \n\t" // a3 <<= 5;
+ "movq %%mm3, %%mm2 \n\t" // a2 = a3;
+ "psllw $1, %%mm3 \n\t" // a3 <<= 1;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ ::); break;
+ }
+ asm volatile("paddsw %%mm7, %%mm0 \n\t" // am += 64;
+ "psraw $7, %%mm0 \n\t" // am >>= 7;
+ "packuswb %%mm6, %%mm0 \n\t"
+ "movd %%mm0, (%0) \n\t" // tmp[x] = am;
+ ::"r"(dst));
+}
+
+static always_inline void mc_block_x(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+ int x, y;
+ assert(!(b_w&3) && !(b_h&3) && !(dx&1));
+ asm volatile("pcmpeqw %%mm7, %%mm7 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "psrlw $9, %%mm7 \n\t" // 64
+ "pxor %%mm6, %%mm6 \n\t" // 0
+ ::);
+ if (dy == 0) {
+ b_h -= 5;
+ tmp = dst;
+ src += 2*stride;
+ }
+ if (dx != 0) for(y=0; y < b_h+5; y++){
+ for(x=0; x < b_w; x += 4){
+ asm volatile("movd (%0), %%mm0 \n\t" // am = src[x ];
+ "movd 1(%0), %%mm1 \n\t" // a1 = src[x + 1];
+ "movd 2(%0), %%mm2 \n\t" // a2 = src[x + 2];
+ "movd 3(%0), %%mm3 \n\t" // a3 = src[x + 3];
+ "movd 4(%0), %%mm4 \n\t" // a4 = src[x + 4];
+ "movd 5(%0), %%mm5 \n\t" // a5 = src[x + 5];
+ ::"r"(&src[x]));
+
+ mc_block_core(dx, &tmp[x]);
+ }
+ tmp += stride;
+ src += stride;
+ }
+}
+
+static always_inline void mc_block_y(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
+ int x, y;
+ if (dx == 0) tmp = (uint8_t*)src + 2;
+ if (dy != 0) for(y=0; y < b_h; y++){
+ for(x=0; x < b_w; x += 4){
+ asm volatile("movd (%0), %%mm0 \n\t" // am = tmp[x + 0*stride];
+ "movd (%1), %%mm1 \n\t" // a1 = tmp[x + 1*stride];
+ "movd (%0,%2,2), %%mm2 \n\t" // a2 = tmp[x + 2*stride];
+ "movd (%1,%2,2), %%mm3 \n\t" // a3 = tmp[x + 3*stride];
+ "movd (%0,%2,4), %%mm4 \n\t" // a4 = tmp[x + 4*stride];
+ "movd (%1,%2,4), %%mm5 \n\t" // a5 = tmp[x + 5*stride];
+ ::"r"(&tmp[x]),"r"(&tmp[x+stride]),"a"(stride));
+
+ mc_block_core(dy, &dst[x]);
+ }
+ dst += stride;
+ tmp += stride;
+ }
+ asm volatile("emms"::);
+}
+
+#define mca(a)\
+static void mc_block_x ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+ mc_block_x(dst, src, tmp, stride, b_w, b_h, a, dy);\
+}\
+static void mc_block_y ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+ mc_block_y(dst, src, tmp, stride, b_w, b_h, dx, a);\
+}
+
+mca(0)
+mca(2)
+mca(4)
+mca(6)
+mca(8)
+mca(10)
+mca(12)
+mca(14)
+
+extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
+extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
+extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+extern void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+extern void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
mm_flags = mm_support();
@@ -2950,6 +3094,35 @@
c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
}
+
+ if(mm_flags & MM_SSE2){
+ c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+ c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+ }
+ else{
+ c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+ c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+ }
+
+ c->mc_block_x[0] = mc_block_x0;
+ c->mc_block_x[1] = mc_block_x2;
+ c->mc_block_x[2] = mc_block_x4;
+ c->mc_block_x[3] = mc_block_x6;
+ c->mc_block_x[4] = mc_block_x8;
+ c->mc_block_x[5] = mc_block_x10;
+ c->mc_block_x[6] = mc_block_x12;
+ c->mc_block_x[7] = mc_block_x14;
+
+ c->mc_block_y[0] = mc_block_y0;
+ c->mc_block_y[1] = mc_block_y2;
+ c->mc_block_y[2] = mc_block_y4;
+ c->mc_block_y[3] = mc_block_y6;
+ c->mc_block_y[4] = mc_block_y8;
+ c->mc_block_y[5] = mc_block_y10;
+ c->mc_block_y[6] = mc_block_y12;
+ c->mc_block_y[7] = mc_block_y14;
}
#ifdef CONFIG_ENCODERS
--- /dev/null 2006-02-17 20:18:22.000000000 +0200
+++ libavcodec/snow.h 2006-03-10 18:33:42.000000000 +0200
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni at gmx.at>
+ * Copyright (C) 2006 Robert Edele <yartrebo at earthlink.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _SNOW_H
+#define _SNOW_H
+
+
+#define MID_STATE 128
+
+#define MAX_DECOMPOSITIONS 8
+#define MAX_PLANES 4
+#define DWTELEM int
+#define QSHIFT 5
+#define QROOT (1<<QSHIFT)
+#define LOSSLESS_QLOG -128
+#define FRAC_BITS 8
+
+/** Used to minimize the amount of memory used in order to optimize cache performance. **/
+typedef struct {
+ DWTELEM * * line; ///< For use by idwt and predict_slices.
+ DWTELEM * * data_stack; ///< Used for internal purposes.
+ int data_stack_top;
+ int line_count;
+ int line_width;
+ int data_count;
+ DWTELEM * base_buffer; ///< Buffer that this structure is caching.
+} slice_buffer;
+
+#define liftS lift
+#define lift5 lift
+#if 1
+#define W_AM 3
+#define W_AO 0
+#define W_AS 1
+
+#undef liftS
+#define W_BM 1
+#define W_BO 8
+#define W_BS 4
+
+#define W_CM 1
+#define W_CO 0
+#define W_CS 0
+
+#define W_DM 3
+#define W_DO 4
+#define W_DS 3
+#elif 0
+#define W_AM 55
+#define W_AO 16
+#define W_AS 5
+
+#define W_BM 3
+#define W_BO 32
+#define W_BS 6
+
+#define W_CM 127
+#define W_CO 64
+#define W_CS 7
+
+#define W_DM 7
+#define W_DO 8
+#define W_DS 4
+#elif 0
+#define W_AM 97
+#define W_AO 32
+#define W_AS 6
+
+#define W_BM 63
+#define W_BO 512
+#define W_BS 10
+
+#define W_CM 13
+#define W_CO 8
+#define W_CS 4
+
+#define W_DM 15
+#define W_DO 16
+#define W_DS 5
+
+#else
+
+#define W_AM 203
+#define W_AO 64
+#define W_AS 7
+
+#define W_BM 217
+#define W_BO 2048
+#define W_BS 12
+
+#define W_CM 113
+#define W_CO 64
+#define W_CS 7
+
+#define W_DM 227
+#define W_DO 128
+#define W_DS 9
+#endif
+
+extern void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+extern void ff_snow_horizontal_compose97i(DWTELEM *b, int width);
+extern void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+#endif
--- /dev/null 2006-03-09 06:42:28.643578952 -0500
+++ libavcodec/i386/snowdsp_mmx.c 2006-03-09 18:52:28.941598104 -0500
@@ -0,0 +1,1445 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo at earthlink.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Snow MMX and SSE2 optimizations by Robert Edele <yartrebo at earthlink.net>
+ */
+
+#include "../avcodec.h"
+#include "../snow.h"
+#include "mmx.h"
+
+static always_inline void snow_interleave_line_header(int * i, int width, DWTELEM * low, DWTELEM * high){
+ (*i) = (width) - 2;
+
+ if (width & 1){
+ low[(*i)+1] = low[((*i)+1)>>1];
+ (*i)--;
+ }
+}
+
+static always_inline void snow_horizontal_compose_lift_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w, int lift_high, int mul, int add, int shift){
+ for(; i<w; i++){
+ dst[i] = src[i] - ((mul * (ref[i] + ref[i + 1]) + add) >> shift);
+ }
+
+ if((width^lift_high)&1){
+ dst[w] = src[w] - ((mul * 2 * ref[w] + add) >> shift);
+ }
+}
+
+static always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){
+ for(; i<w; i++){
+ dst[i] = src[i] - (((-(ref[i] + ref[(i+1)])+W_BO) - 4 * src[i]) >> W_BS);
+ }
+
+ if(width&1){
+ dst[w] = src[w] - (((-2 * ref[w] + W_BO) - 4 * src[w]) >> W_BS);
+ }
+}
+
+void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
+ const int w2= (width+1)>>1;
+ // SSE2 code runs faster with pointers aligned on a 32-byte boundary.
+ DWTELEM temp_buf[(width>>1) + 4];
+ DWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) >> 2);
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ DWTELEM * const ref = b + w2 - 1;
+ DWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+ // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+ // The savings in code and time are well worth having to store this value and
+ // calculate b[0] correctly afterwards.
+
+ i = 0;
+ asm volatile(
+ "pcmpeqd %%xmm7, %%xmm7 \n\t"
+ "pslld $31, %%xmm7 \n\t"
+ "psrld $29, %%xmm7 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ asm volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 4(%1), %%xmm2 \n\t"
+ "movdqu 20(%1), %%xmm6 \n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm5, %%xmm6 \n\t"
+ "movdqa %%xmm2, %%xmm0 \n\t"
+ "movdqa %%xmm6, %%xmm4 \n\t"
+ "paddd %%xmm2, %%xmm2 \n\t"
+ "paddd %%xmm6, %%xmm6 \n\t"
+ "paddd %%xmm0, %%xmm2 \n\t"
+ "paddd %%xmm4, %%xmm6 \n\t"
+ "paddd %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm7, %%xmm6 \n\t"
+ "psrad $3, %%xmm2 \n\t"
+ "psrad $3, %%xmm6 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "psubd %%xmm2, %%xmm0 \n\t"
+ "psubd %%xmm6, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ }
+
+ { // Lift 1
+ DWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
+ dst[i] = dst[i] - (b[i] + b[i + 1]);
+ }
+ for(; i<w_r-7; i+=8){
+ asm volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 4(%1), %%xmm2 \n\t"
+ "movdqu 20(%1), %%xmm6 \n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm5, %%xmm6 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "psubd %%xmm2, %%xmm0 \n\t"
+ "psubd %%xmm6, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ DWTELEM * const ref = b+w2 - 1;
+ DWTELEM b_0 = b[0];
+
+ i = 0;
+ asm volatile(
+ "pslld $1, %%xmm7 \n\t" /* xmm7 already holds a '4' from 2 lifts ago. */
+ ::);
+ for(; i<w_l-7; i+=8){
+ asm volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 4(%1), %%xmm0 \n\t"
+ "movdqu 20(%1), %%xmm4 \n\t"
+ "paddd %%xmm1, %%xmm0 \n\t"
+ "paddd %%xmm5, %%xmm4 \n\t"
+ "movdqa %%xmm7, %%xmm1 \n\t"
+ "movdqa %%xmm7, %%xmm5 \n\t"
+ "psubd %%xmm0, %%xmm1 \n\t"
+ "psubd %%xmm4, %%xmm5 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "pslld $2, %%xmm0 \n\t"
+ "pslld $2, %%xmm4 \n\t"
+ "psubd %%xmm0, %%xmm1 \n\t"
+ "psubd %%xmm4, %%xmm5 \n\t"
+ "psrad $4, %%xmm1 \n\t"
+ "psrad $4, %%xmm5 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "psubd %%xmm1, %%xmm0 \n\t"
+ "psubd %%xmm5, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
+ }
+
+ { // Lift 3
+ DWTELEM * const src = b+w2;
+
+ i = 0;
+ for(; (((long)&temp[i]) & 0xF) && i<w_r; i++){
+ temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+ }
+ for(; i<w_r-7; i+=8){
+ asm volatile(
+ "movdqu 4(%1), %%xmm2 \n\t"
+ "movdqu 20(%1), %%xmm6 \n\t"
+ "paddd (%1), %%xmm2 \n\t"
+ "paddd 16(%1), %%xmm6 \n\t"
+ "movdqa %%xmm2, %%xmm0 \n\t"
+ "movdqa %%xmm6, %%xmm4 \n\t"
+ "pslld $2, %%xmm2 \n\t"
+ "pslld $2, %%xmm6 \n\t"
+ "psubd %%xmm2, %%xmm0 \n\t"
+ "psubd %%xmm6, %%xmm4 \n\t"
+ "psrad $1, %%xmm0 \n\t"
+ "psrad $1, %%xmm4 \n\t"
+ "movdqu (%0), %%xmm2 \n\t"
+ "movdqu 16(%0), %%xmm6 \n\t"
+ "psubd %%xmm0, %%xmm2 \n\t"
+ "psubd %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm2, (%2) \n\t"
+ "movdqa %%xmm6, 16(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x1E) != 0x1E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=30; i>=0; i-=32){
+ asm volatile(
+ "movdqa (%1), %%xmm0 \n\t"
+ "movdqa 16(%1), %%xmm2 \n\t"
+ "movdqa 32(%1), %%xmm4 \n\t"
+ "movdqa 48(%1), %%xmm6 \n\t"
+ "movdqa (%1), %%xmm1 \n\t"
+ "movdqa 16(%1), %%xmm3 \n\t"
+ "movdqa 32(%1), %%xmm5 \n\t"
+ "movdqa 48(%1), %%xmm7 \n\t"
+ "punpckldq (%2), %%xmm0 \n\t"
+ "punpckldq 16(%2), %%xmm2 \n\t"
+ "punpckldq 32(%2), %%xmm4 \n\t"
+ "punpckldq 48(%2), %%xmm6 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm2, 32(%0) \n\t"
+ "movdqa %%xmm4, 64(%0) \n\t"
+ "movdqa %%xmm6, 96(%0) \n\t"
+ "punpckhdq (%2), %%xmm1 \n\t"
+ "punpckhdq 16(%2), %%xmm3 \n\t"
+ "punpckhdq 32(%2), %%xmm5 \n\t"
+ "punpckhdq 48(%2), %%xmm7 \n\t"
+ "movdqa %%xmm1, 16(%0) \n\t"
+ "movdqa %%xmm3, 48(%0) \n\t"
+ "movdqa %%xmm5, 80(%0) \n\t"
+ "movdqa %%xmm7, 112(%0) \n\t"
+ :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
+ const int w2= (width+1)>>1;
+ DWTELEM temp[width >> 1];
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ DWTELEM * const ref = b + w2 - 1;
+
+ i = 1;
+ b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ asm volatile(
+ "pcmpeqd %%mm7, %%mm7 \n\t"
+ "pslld $31, %%mm7 \n\t"
+ "psrld $29, %%mm7 \n\t"
+ ::);
+ for(; i<w_l-3; i+=4){
+ asm volatile(
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm5 \n\t"
+ "movq 4(%1), %%mm2 \n\t"
+ "movq 12(%1), %%mm6 \n\t"
+ "paddd %%mm1, %%mm2 \n\t"
+ "paddd %%mm5, %%mm6 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "movq %%mm6, %%mm4 \n\t"
+ "paddd %%mm2, %%mm2 \n\t"
+ "paddd %%mm6, %%mm6 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "paddd %%mm4, %%mm6 \n\t"
+ "paddd %%mm7, %%mm2 \n\t"
+ "paddd %%mm7, %%mm6 \n\t"
+ "psrad $3, %%mm2 \n\t"
+ "psrad $3, %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "psubd %%mm2, %%mm0 \n\t"
+ "psubd %%mm6, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ }
+
+ { // Lift 1
+ DWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; i<w_r-3; i+=4){
+ asm volatile(
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm5 \n\t"
+ "movq 4(%1), %%mm2 \n\t"
+ "movq 12(%1), %%mm6 \n\t"
+ "paddd %%mm1, %%mm2 \n\t"
+ "paddd %%mm5, %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "psubd %%mm2, %%mm0 \n\t"
+ "psubd %%mm6, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ DWTELEM * const ref = b+w2 - 1;
+
+ i = 1;
+ b[0] = b[0] - (((-2 * ref[1] + W_BO) - 4 * b[0]) >> W_BS);
+ asm volatile(
+ "pslld $1, %%mm7 \n\t" /* xmm7 already holds a '4' from 2 lifts ago. */
+ ::);
+ for(; i<w_l-3; i+=4){
+ asm volatile(
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm5 \n\t"
+ "movq 4(%1), %%mm0 \n\t"
+ "movq 12(%1), %%mm4 \n\t"
+ "paddd %%mm1, %%mm0 \n\t"
+ "paddd %%mm5, %%mm4 \n\t"
+ "movq %%mm7, %%mm1 \n\t"
+ "movq %%mm7, %%mm5 \n\t"
+ "psubd %%mm0, %%mm1 \n\t"
+ "psubd %%mm4, %%mm5 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "pslld $2, %%mm0 \n\t"
+ "pslld $2, %%mm4 \n\t"
+ "psubd %%mm0, %%mm1 \n\t"
+ "psubd %%mm4, %%mm5 \n\t"
+ "psrad $4, %%mm1 \n\t"
+ "psrad $4, %%mm5 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "psubd %%mm1, %%mm0 \n\t"
+ "psubd %%mm5, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ }
+
+ { // Lift 3
+ DWTELEM * const src = b+w2;
+ i = 0;
+
+ for(; i<w_r-3; i+=4){
+ asm volatile(
+ "movq 4(%1), %%mm2 \n\t"
+ "movq 12(%1), %%mm6 \n\t"
+ "paddd (%1), %%mm2 \n\t"
+ "paddd 8(%1), %%mm6 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "movq %%mm6, %%mm4 \n\t"
+ "pslld $2, %%mm2 \n\t"
+ "pslld $2, %%mm6 \n\t"
+ "psubd %%mm2, %%mm0 \n\t"
+ "psubd %%mm6, %%mm4 \n\t"
+ "psrad $1, %%mm0 \n\t"
+ "psrad $1, %%mm4 \n\t"
+ "movq (%0), %%mm2 \n\t"
+ "movq 8(%0), %%mm6 \n\t"
+ "psubd %%mm0, %%mm2 \n\t"
+ "psubd %%mm4, %%mm6 \n\t"
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm6, 8(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0xE) != 0xE; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=14; i>=0; i-=16){
+ asm volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm6 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm5 \n\t"
+ "movq 24(%1), %%mm7 \n\t"
+ "punpckldq (%2), %%mm0 \n\t"
+ "punpckldq 8(%2), %%mm2 \n\t"
+ "punpckldq 16(%2), %%mm4 \n\t"
+ "punpckldq 24(%2), %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, 16(%0) \n\t"
+ "movq %%mm4, 32(%0) \n\t"
+ "movq %%mm6, 48(%0) \n\t"
+ "punpckhdq (%2), %%mm1 \n\t"
+ "punpckhdq 8(%2), %%mm3 \n\t"
+ "punpckhdq 16(%2), %%mm5 \n\t"
+ "punpckhdq 24(%2), %%mm7 \n\t"
+ "movq %%mm1, 8(%0) \n\t"
+ "movq %%mm3, 24(%0) \n\t"
+ "movq %%mm5, 40(%0) \n\t"
+ "movq %%mm7, 56(%0) \n\t"
+ :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
+ long i = width;
+
+ while(i & 0xF)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+
+ asm volatile (
+ "mov %6, %%"REG_d" \n\t"
+ "jmp 2f \n\t"
+ "1: \n\t"
+
+ "mov %5, %%"REG_a" \n\t"
+ "mov %3, %%"REG_b" \n\t"
+
+ "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
+ "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
+ "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
+ "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
+
+ "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
+ "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
+ "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
+ "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
+
+ "movdqa %%xmm0, %%xmm1 \n\t"
+ "movdqa %%xmm2, %%xmm3 \n\t"
+ "movdqa %%xmm4, %%xmm5 \n\t"
+ "movdqa %%xmm6, %%xmm7 \n\t"
+
+ "pslld $1, %%xmm0 \n\t"
+ "pslld $1, %%xmm2 \n\t"
+ "pslld $1, %%xmm4 \n\t"
+ "pslld $1, %%xmm6 \n\t"
+
+ "paddd %%xmm1, %%xmm0 \n\t"
+ "paddd %%xmm3, %%xmm2 \n\t"
+ "paddd %%xmm5, %%xmm4 \n\t"
+ "paddd %%xmm7, %%xmm6 \n\t"
+
+ "pcmpeqd %%xmm1, %%xmm1 \n\t"
+ "pslld $31, %%xmm1 \n\t"
+ "psrld $29, %%xmm1 \n\t"
+ "mov %4, %%"REG_a" \n\t"
+
+ "paddd %%xmm1, %%xmm0 \n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm1, %%xmm4 \n\t"
+ "paddd %%xmm1, %%xmm6 \n\t"
+
+ "psrad $3, %%xmm0 \n\t"
+ "psrad $3, %%xmm2 \n\t"
+ "psrad $3, %%xmm4 \n\t"
+ "psrad $3, %%xmm6 \n\t"
+
+ "movdqa (%%"REG_a",%%"REG_d",4), %%xmm1 \n\t"
+ "movdqa 16(%%"REG_a",%%"REG_d",4), %%xmm3 \n\t"
+ "movdqa 32(%%"REG_a",%%"REG_d",4), %%xmm5 \n\t"
+ "movdqa 48(%%"REG_a",%%"REG_d",4), %%xmm7 \n\t"
+
+ "psubd %%xmm0, %%xmm1 \n\t"
+ "psubd %%xmm2, %%xmm3 \n\t"
+ "psubd %%xmm4, %%xmm5 \n\t"
+ "psubd %%xmm6, %%xmm7 \n\t"
+
+ "movdqa %%xmm1, (%%"REG_a",%%"REG_d",4) \n\t"
+ "movdqa %%xmm3, 16(%%"REG_a",%%"REG_d",4) \n\t"
+ "movdqa %%xmm5, 32(%%"REG_a",%%"REG_d",4) \n\t"
+ "movdqa %%xmm7, 48(%%"REG_a",%%"REG_d",4) \n\t"
+
+ "mov %2, %%"REG_c" \n\t"
+
+ "paddd (%%"REG_c",%%"REG_d",4), %%xmm1 \n\t"
+ "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm3 \n\t"
+ "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm5 \n\t"
+ "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm7 \n\t"
+
+ "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
+ "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
+ "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
+ "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
+
+ "psubd %%xmm1, %%xmm0 \n\t"
+ "psubd %%xmm3, %%xmm2 \n\t"
+ "psubd %%xmm5, %%xmm4 \n\t"
+ "psubd %%xmm7, %%xmm6 \n\t"
+
+ "movdqa %%xmm0, (%%"REG_b",%%"REG_d",4) \n\t"
+ "movdqa %%xmm2, 16(%%"REG_b",%%"REG_d",4) \n\t"
+ "movdqa %%xmm4, 32(%%"REG_b",%%"REG_d",4) \n\t"
+ "movdqa %%xmm6, 48(%%"REG_b",%%"REG_d",4) \n\t"
+
+ "mov %1, %%"REG_a" \n\t"
+
+ "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
+ "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
+ "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
+ "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
+
+ "movdqa (%%"REG_c",%%"REG_d",4), %%xmm1 \n\t"
+ "movdqa 16(%%"REG_c",%%"REG_d",4), %%xmm3 \n\t"
+ "movdqa 32(%%"REG_c",%%"REG_d",4), %%xmm5 \n\t"
+ "movdqa 48(%%"REG_c",%%"REG_d",4), %%xmm7 \n\t"
+
+ "pslld $2, %%xmm1 \n\t"
+ "pslld $2, %%xmm3 \n\t"
+ "pslld $2, %%xmm5 \n\t"
+ "pslld $2, %%xmm7 \n\t"
+
+ "paddd %%xmm1, %%xmm0 \n\t"
+ "paddd %%xmm3, %%xmm2 \n\t"
+ "paddd %%xmm5, %%xmm4 \n\t"
+ "paddd %%xmm7, %%xmm6 \n\t"
+
+ "pcmpeqd %%xmm1, %%xmm1 \n\t"
+ "pslld $31, %%xmm1 \n\t"
+ "psrld $28, %%xmm1 \n\t"
+ "mov %0, %%"REG_b" \n\t"
+
+ "paddd %%xmm1, %%xmm0 \n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm1, %%xmm4 \n\t"
+ "paddd %%xmm1, %%xmm6 \n\t"
+
+ "psrad $4, %%xmm0 \n\t"
+ "psrad $4, %%xmm2 \n\t"
+ "psrad $4, %%xmm4 \n\t"
+ "psrad $4, %%xmm6 \n\t"
+
+ "paddd (%%"REG_c",%%"REG_d",4), %%xmm0 \n\t"
+ "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm2 \n\t"
+ "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm4 \n\t"
+ "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm6 \n\t"
+
+ "movdqa %%xmm0, (%%"REG_c",%%"REG_d",4) \n\t"
+ "movdqa %%xmm2, 16(%%"REG_c",%%"REG_d",4) \n\t"
+ "movdqa %%xmm4, 32(%%"REG_c",%%"REG_d",4) \n\t"
+ "movdqa %%xmm6, 48(%%"REG_c",%%"REG_d",4) \n\t"
+
+ "paddd (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
+ "paddd 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
+ "paddd 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
+ "paddd 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
+
+ "movdqa %%xmm0, %%xmm1 \n\t"
+ "movdqa %%xmm2, %%xmm3 \n\t"
+ "movdqa %%xmm4, %%xmm5 \n\t"
+ "movdqa %%xmm6, %%xmm7 \n\t"
+
+ "pslld $1, %%xmm0 \n\t"
+ "pslld $1, %%xmm2 \n\t"
+ "pslld $1, %%xmm4 \n\t"
+ "pslld $1, %%xmm6 \n\t"
+
+ "paddd %%xmm1, %%xmm0 \n\t"
+ "paddd %%xmm3, %%xmm2 \n\t"
+ "paddd %%xmm5, %%xmm4 \n\t"
+ "paddd %%xmm7, %%xmm6 \n\t"
+
+ "psrad $1, %%xmm0 \n\t"
+ "psrad $1, %%xmm2 \n\t"
+ "psrad $1, %%xmm4 \n\t"
+ "psrad $1, %%xmm6 \n\t"
+
+ "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
+ "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
+ "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
+ "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
+
+ "movdqa %%xmm0, (%%"REG_a",%%"REG_d",4) \n\t"
+ "movdqa %%xmm2, 16(%%"REG_a",%%"REG_d",4) \n\t"
+ "movdqa %%xmm4, 32(%%"REG_a",%%"REG_d",4) \n\t"
+ "movdqa %%xmm6, 48(%%"REG_a",%%"REG_d",4) \n\t"
+
+ "2: \n\t"
+ "sub $16, %%"REG_d" \n\t"
+ "jge 1b \n\t"
+ ::
+ "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5),"rm"(i):
+ "%"REG_a"","%"REG_b"","%"REG_c"", "%"REG_d"");
+}
+
+void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
+ long i = width;
+ while(i & 0x7)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+
+ asm volatile(
+ "mov %6, %%"REG_d" \n\t"
+ "jmp 2f \n\t"
+ "1: \n\t"
+
+ "mov %5, %%"REG_a" \n\t"
+ "mov %3, %%"REG_b" \n\t"
+
+ "movq (%%"REG_b",%%"REG_d",4), %%mm0 \n\t"
+ "movq 8(%%"REG_b",%%"REG_d",4), %%mm2 \n\t"
+ "movq 16(%%"REG_b",%%"REG_d",4), %%mm4 \n\t"
+ "movq 24(%%"REG_b",%%"REG_d",4), %%mm6 \n\t"
+
+ "paddd (%%"REG_a",%%"REG_d",4), %%mm0 \n\t"
+ "paddd 8(%%"REG_a",%%"REG_d",4), %%mm2 \n\t"
+ "paddd 16(%%"REG_a",%%"REG_d",4), %%mm4 \n\t"
+ "paddd 24(%%"REG_a",%%"REG_d",4), %%mm6 \n\t"
+
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "movq %%mm6, %%mm7 \n\t"
+
+ "pslld $1, %%mm0 \n\t"
+ "pslld $1, %%mm2 \n\t"
+ "pslld $1, %%mm4 \n\t"
+ "pslld $1, %%mm6 \n\t"
+
+ "paddd %%mm1, %%mm0 \n\t"
+ "paddd %%mm3, %%mm2 \n\t"
+ "paddd %%mm5, %%mm4 \n\t"
+ "paddd %%mm7, %%mm6 \n\t"
+
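+        /* build the rounding constant 4 in every dword of mm1 for the >>3 below */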
+ "pcmpeqd %%mm1, %%mm1 \n\t"
+ "pslld $31, %%mm1 \n\t"
+ "psrld $29, %%mm1 \n\t"
+ "mov %4, %%"REG_a" \n\t"
+
+ "paddd %%mm1, %%mm0 \n\t"
+ "paddd %%mm1, %%mm2 \n\t"
+ "paddd %%mm1, %%mm4 \n\t"
+ "paddd %%mm1, %%mm6 \n\t"
+
+ "psrad $3, %%mm0 \n\t"
+ "psrad $3, %%mm2 \n\t"
+ "psrad $3, %%mm4 \n\t"
+ "psrad $3, %%mm6 \n\t"
+
+ "movq (%%"REG_a",%%"REG_d",4), %%mm1 \n\t"
+ "movq 8(%%"REG_a",%%"REG_d",4), %%mm3 \n\t"
+ "movq 16(%%"REG_a",%%"REG_d",4), %%mm5 \n\t"
+ "movq 24(%%"REG_a",%%"REG_d",4), %%mm7 \n\t"
+
+ "psubd %%mm0, %%mm1 \n\t"
+ "psubd %%mm2, %%mm3 \n\t"
+ "psubd %%mm4, %%mm5 \n\t"
+ "psubd %%mm6, %%mm7 \n\t"
+
+ "movq %%mm1, (%%"REG_a",%%"REG_d",4) \n\t"
+ "movq %%mm3, 8(%%"REG_a",%%"REG_d",4) \n\t"
+ "movq %%mm5, 16(%%"REG_a",%%"REG_d",4) \n\t"
+ "movq %%mm7, 24(%%"REG_a",%%"REG_d",4) \n\t"
+
+ "mov %2, %%"REG_c" \n\t"
+
+ "paddd (%%"REG_c",%%"REG_d",4), %%mm1 \n\t"
+ "paddd 8(%%"REG_c",%%"REG_d",4), %%mm3 \n\t"
+ "paddd 16(%%"REG_c",%%"REG_d",4), %%mm5 \n\t"
+ "paddd 24(%%"REG_c",%%"REG_d",4), %%mm7 \n\t"
+
+ "movq (%%"REG_b",%%"REG_d",4), %%mm0 \n\t"
+ "movq 8(%%"REG_b",%%"REG_d",4), %%mm2 \n\t"
+ "movq 16(%%"REG_b",%%"REG_d",4), %%mm4 \n\t"
+ "movq 24(%%"REG_b",%%"REG_d",4), %%mm6 \n\t"
+
+ "psubd %%mm1, %%mm0 \n\t"
+ "psubd %%mm3, %%mm2 \n\t"
+ "psubd %%mm5, %%mm4 \n\t"
+ "psubd %%mm7, %%mm6 \n\t"
+
+ "movq %%mm0, (%%"REG_b",%%"REG_d",4) \n\t"
+ "movq %%mm2, 8(%%"REG_b",%%"REG_d",4) \n\t"
+ "movq %%mm4, 16(%%"REG_b",%%"REG_d",4) \n\t"
+ "movq %%mm6, 24(%%"REG_b",%%"REG_d",4) \n\t"
+
+ "mov %1, %%"REG_a" \n\t"
+
+ "paddd (%%"REG_a",%%"REG_d",4), %%mm0 \n\t"
+ "paddd 8(%%"REG_a",%%"REG_d",4), %%mm2 \n\t"
+ "paddd 16(%%"REG_a",%%"REG_d",4), %%mm4 \n\t"
+ "paddd 24(%%"REG_a",%%"REG_d",4), %%mm6 \n\t"
+
+ "movq (%%"REG_c",%%"REG_d",4), %%mm1 \n\t"
+ "movq 8(%%"REG_c",%%"REG_d",4), %%mm3 \n\t"
+ "movq 16(%%"REG_c",%%"REG_d",4), %%mm5 \n\t"
+ "movq 24(%%"REG_c",%%"REG_d",4), %%mm7 \n\t"
+
+ "pslld $2, %%mm1 \n\t"
+ "pslld $2, %%mm3 \n\t"
+ "pslld $2, %%mm5 \n\t"
+ "pslld $2, %%mm7 \n\t"
+
+ "paddd %%mm1, %%mm0 \n\t"
+ "paddd %%mm3, %%mm2 \n\t"
+ "paddd %%mm5, %%mm4 \n\t"
+ "paddd %%mm7, %%mm6 \n\t"
+
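+        /* build the rounding constant 8 in every dword of mm1 for the >>4 below */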
+ "pcmpeqd %%mm1, %%mm1 \n\t"
+ "pslld $31, %%mm1 \n\t"
+ "psrld $28, %%mm1 \n\t"
+ "mov %0, %%"REG_b" \n\t"
+
+ "paddd %%mm1, %%mm0 \n\t"
+ "paddd %%mm1, %%mm2 \n\t"
+ "paddd %%mm1, %%mm4 \n\t"
+ "paddd %%mm1, %%mm6 \n\t"
+
+ "psrad $4, %%mm0 \n\t"
+ "psrad $4, %%mm2 \n\t"
+ "psrad $4, %%mm4 \n\t"
+ "psrad $4, %%mm6 \n\t"
+
+ "paddd (%%"REG_c",%%"REG_d",4), %%mm0 \n\t"
+ "paddd 8(%%"REG_c",%%"REG_d",4), %%mm2 \n\t"
+ "paddd 16(%%"REG_c",%%"REG_d",4), %%mm4 \n\t"
+ "paddd 24(%%"REG_c",%%"REG_d",4), %%mm6 \n\t"
+
+ "movq %%mm0, (%%"REG_c",%%"REG_d",4) \n\t"
+ "movq %%mm2, 8(%%"REG_c",%%"REG_d",4) \n\t"
+ "movq %%mm4, 16(%%"REG_c",%%"REG_d",4) \n\t"
+ "movq %%mm6, 24(%%"REG_c",%%"REG_d",4) \n\t"
+
+ "paddd (%%"REG_b",%%"REG_d",4), %%mm0 \n\t"
+ "paddd 8(%%"REG_b",%%"REG_d",4), %%mm2 \n\t"
+ "paddd 16(%%"REG_b",%%"REG_d",4), %%mm4 \n\t"
+ "paddd 24(%%"REG_b",%%"REG_d",4), %%mm6 \n\t"
+
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "movq %%mm6, %%mm7 \n\t"
+
+ "pslld $1, %%mm0 \n\t"
+ "pslld $1, %%mm2 \n\t"
+ "pslld $1, %%mm4 \n\t"
+ "pslld $1, %%mm6 \n\t"
+
+ "paddd %%mm1, %%mm0 \n\t"
+ "paddd %%mm3, %%mm2 \n\t"
+ "paddd %%mm5, %%mm4 \n\t"
+ "paddd %%mm7, %%mm6 \n\t"
+
+ "psrad $1, %%mm0 \n\t"
+ "psrad $1, %%mm2 \n\t"
+ "psrad $1, %%mm4 \n\t"
+ "psrad $1, %%mm6 \n\t"
+
+ "paddd (%%"REG_a",%%"REG_d",4), %%mm0 \n\t"
+ "paddd 8(%%"REG_a",%%"REG_d",4), %%mm2 \n\t"
+ "paddd 16(%%"REG_a",%%"REG_d",4), %%mm4 \n\t"
+ "paddd 24(%%"REG_a",%%"REG_d",4), %%mm6 \n\t"
+
+ "movq %%mm0, (%%"REG_a",%%"REG_d",4) \n\t"
+ "movq %%mm2, 8(%%"REG_a",%%"REG_d",4) \n\t"
+ "movq %%mm4, 16(%%"REG_a",%%"REG_d",4) \n\t"
+ "movq %%mm6, 24(%%"REG_a",%%"REG_d",4) \n\t"
+
+ "2: \n\t"
+ "sub $8, %%"REG_d" \n\t"
+ "jge 1b \n\t"
+ ::
+ "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5),"rm"(i):
+ "%"REG_a"","%"REG_b"","%"REG_c"", "%"REG_d"");
+}
+
+
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+ int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ DWTELEM * * dst_array = sb->line + src_y;
+
+ asm volatile(
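+        /* operands: %0 = dst8, %1 = dst_array, %2 = src_x*4 (byte offset into the slice line),
+           %3 = obmc (REG_S), %4 = block (REG_a), %5 = b_h (REG_b, row counter), %6 = src_stride (REG_c) */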
+ "mov %6, %%"REG_c" \n\t"
+ "mov %5, %%"REG_b" \n\t"
+ "mov %3, %%"REG_S" \n\t"
+ "pcmpeqd %%xmm4, %%xmm4 \n\t"
+ "pslld $31, %%xmm4 \n\t"
+ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */
+ "psrld $24, %%xmm4 \n\t" /* FRAC_BITS >> 1 */
+
+ "1: \n\t"
+ "movq (%%"REG_S"), %%xmm0 \n\t"
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "movq 8(%%"REG_S"), %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "movq (%%"REG_d"), %%xmm5 \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "punpcklbw %%xmm7, %%xmm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movq (%%"REG_d"), %%xmm6 \n\t"
+ "pmullw %%xmm0, %%xmm5 \n\t"
+ "punpcklbw %%xmm7, %%xmm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+ "mov (%%"REG_D"), %%"REG_D" \n\t"
+
+ "movq 128(%%"REG_S"), %%xmm0 \n\t"
+ "pmullw %%xmm1, %%xmm6 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "movq 136(%%"REG_S"), %%xmm1 \n\t"
+ "add %2, %%"REG_D" \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "movq (%%"REG_d"), %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%xmm5, %%xmm6 \n\t"
+ "pmullw %%xmm0, %%xmm2 \n\t"
+ "movq (%%"REG_d"), %%xmm3 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "paddusw %%xmm2, %%xmm6 \n\t"
+ "pmullw %%xmm1, %%xmm3 \n\t"
+ "paddusw %%xmm3, %%xmm6 \n\t"
+
+ "movdqa (%%"REG_D"), %%xmm3 \n\t"
+ "movdqa %%xmm6, %%xmm0 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm5 \n\t"
+ "punpckhwd %%xmm7, %%xmm6 \n\t"
+ "movq 24(%%"REG_S"), %%xmm1 \n\t"
+ "punpcklwd %%xmm7, %%xmm0 \n\t"
+ "paddd %%xmm0, %%xmm3 \n\t"
+ "paddd %%xmm6, %%xmm5 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "paddd %%xmm4, %%xmm3 \n\t"
+ "paddd %%xmm4, %%xmm5 \n\t"
+ "movq 16(%%"REG_S"), %%xmm0 \n\t"
+ "psrad $8, %%xmm3 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
+
+ "packssdw %%xmm5, %%xmm3 \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "packuswb %%xmm7, %%xmm3 \n\t"
+
+ "movq %%xmm3, (%%"REG_d") \n\t"
+
+
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "movq (%%"REG_d",%%"REG_c"), %%xmm5; \n\t"
+ "punpcklbw %%xmm7, %%xmm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movq (%%"REG_d",%%"REG_c"), %%xmm6; \n\t"
+ "pmullw %%xmm0, %%xmm5 \n\t"
+ "punpcklbw %%xmm7, %%xmm6 \n\t"
+
+ "movq 144(%%"REG_S"), %%xmm0 \n\t"
+ "pmullw %%xmm1, %%xmm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "movq 152(%%"REG_S"), %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "movq (%%"REG_d",%%"REG_c"), %%xmm2;\n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%xmm5, %%xmm6 \n\t"
+ "pmullw %%xmm0, %%xmm2 \n\t"
+ "movq (%%"REG_d",%%"REG_c"), %%xmm3;\n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "paddusw %%xmm2, %%xmm6 \n\t"
+ "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
+ "pmullw %%xmm1, %%xmm3 \n\t"
+ "sal $1, %%"REG_c" \n\t"
+ "add %2, %%"REG_D" \n\t"
+ "paddusw %%xmm3, %%xmm6 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+
+ "movdqa (%%"REG_D"), %%xmm3 \n\t"
+ "movdqa %%xmm6, %%xmm0 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm5 \n\t"
+ "punpckhwd %%xmm7, %%xmm6 \n\t"
+ "punpcklwd %%xmm7, %%xmm0 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
+ "paddd %%xmm0, %%xmm3 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
+ "paddd %%xmm6, %%xmm5 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
+ "paddd %%xmm4, %%xmm3 \n\t"
+ "add %%"REG_c", (%%"REG_a") \n\t"
+ "paddd %%xmm4, %%xmm5 \n\t"
+ "psrad $8, %%xmm3 \n\t" /* FRAC_BITS. */
+ "add $"PTR_SIZE"*2, %1 \n\t"
+ "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
+ "add $32, %%"REG_S" \n\t"
+
+ "packssdw %%xmm5, %%xmm3 \n\t"
+ "add %%"REG_c", %0 \n\t"
+ "packuswb %%xmm7, %%xmm3 \n\t"
+
+ "sar $1, %%"REG_c" \n\t"
+ "movq %%xmm3, (%%"REG_d",%%"REG_c");\n\t"
+
+ "sub $2, %%"REG_b" \n\t"
+ "jnz 1b \n\t"
+ :
+ :
+ "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):
+ "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+ int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ DWTELEM * * dst_array = sb->line + src_y;
+
+ asm volatile(
+ "mov %6, %%"REG_c" \n\t"
+ "mov %5, %%"REG_b" \n\t"
+ "mov %3, %%"REG_S" \n\t"
+ "pcmpeqd %%xmm4, %%xmm4 \n\t"
+ "pslld $31, %%xmm4 \n\t"
+ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */
+ "psrld $24, %%xmm4 \n\t" /* FRAC_BITS >> 1 */
+
+ "1: \n\t"
+ "movq (%%"REG_S"), %%xmm0 \n\t"
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "movq 16(%%"REG_S"), %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "movq (%%"REG_d"), %%xmm5 \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "punpcklbw %%xmm7, %%xmm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movq (%%"REG_d"), %%xmm6 \n\t"
+ "pmullw %%xmm0, %%xmm5 \n\t"
+ "punpcklbw %%xmm7, %%xmm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+
+ "movq 512(%%"REG_S"), %%xmm0 \n\t"
+ "pmullw %%xmm1, %%xmm6 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "movq 528(%%"REG_S"), %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "movq (%%"REG_d"), %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%xmm5, %%xmm6 \n\t"
+ "mov (%%"REG_D"), %%"REG_D" \n\t"
+ "pmullw %%xmm0, %%xmm2 \n\t"
+ "movq (%%"REG_d"), %%xmm3 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "add %2, %%"REG_D" \n\t"
+ "paddusw %%xmm2, %%xmm6 \n\t"
+ "pmullw %%xmm1, %%xmm3 \n\t"
+ "paddusw %%xmm3, %%xmm6 \n\t"
+
+ "movdqa (%%"REG_D"), %%xmm3 \n\t"
+ "movdqa %%xmm6, %%xmm0 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm5 \n\t"
+ "punpckhwd %%xmm7, %%xmm6 \n\t"
+ "movq 24(%%"REG_S"), %%xmm1 \n\t"
+ "punpcklwd %%xmm7, %%xmm0 \n\t"
+ "paddd %%xmm0, %%xmm3 \n\t"
+ "paddd %%xmm6, %%xmm5 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "paddd %%xmm4, %%xmm3 \n\t"
+ "paddd %%xmm4, %%xmm5 \n\t"
+ "movq 8(%%"REG_S"), %%xmm0 \n\t"
+ "psrad $8, %%xmm3 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
+
+ "packssdw %%xmm5, %%xmm3 \n\t"
+ "packuswb %%xmm7, %%xmm3 \n\t"
+
+ "movq %%xmm3, (%%"REG_d") \n\t"
+
+
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "movq 8(%%"REG_d"), %%xmm5 \n\t"
+ "punpcklbw %%xmm7, %%xmm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movq 8(%%"REG_d"), %%xmm6 \n\t"
+ "pmullw %%xmm0, %%xmm5 \n\t"
+ "punpcklbw %%xmm7, %%xmm6 \n\t"
+
+ "movq 520(%%"REG_S"), %%xmm0 \n\t"
+ "pmullw %%xmm1, %%xmm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "movq 536(%%"REG_S"), %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "movq 8(%%"REG_d"), %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%xmm5, %%xmm6 \n\t"
+ "pmullw %%xmm0, %%xmm2 \n\t"
+ "movq 8(%%"REG_d"), %%xmm3 \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "paddusw %%xmm2, %%xmm6 \n\t"
+ "pmullw %%xmm1, %%xmm3 \n\t"
+ "paddusw %%xmm3, %%xmm6 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+
+ "movdqa 32(%%"REG_D"), %%xmm3 \n\t"
+ "movdqa %%xmm6, %%xmm0 \n\t"
+ "movdqa 48(%%"REG_D"), %%xmm5 \n\t"
+ "punpckhwd %%xmm7, %%xmm6 \n\t"
+ "punpcklwd %%xmm7, %%xmm0 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
+ "paddd %%xmm0, %%xmm3 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
+ "paddd %%xmm6, %%xmm5 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
+ "paddd %%xmm4, %%xmm3 \n\t"
+ "add %%"REG_c", (%%"REG_a") \n\t"
+ "paddd %%xmm4, %%xmm5 \n\t"
+ "psrad $8, %%xmm3 \n\t" /* FRAC_BITS. */
+ "add $"PTR_SIZE"*1, %1 \n\t"
+ "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
+ "add $32, %%"REG_S" \n\t"
+
+ "packssdw %%xmm5, %%xmm3 \n\t"
+ "add %%"REG_c", %0 \n\t"
+ "packuswb %%xmm7, %%xmm3 \n\t"
+
+ "movq %%xmm3, 8(%%"REG_d") \n\t"
+
+ "dec %%"REG_b" \n\t"
+ "jnz 1b \n\t"
+ :
+ :
+ "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):
+ "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+}
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+ int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ DWTELEM * * dst_array = sb->line + src_y;
+
+ asm volatile(
+ "mov %6, %%"REG_c" \n\t"
+ "mov %5, %%"REG_b" \n\t"
+ "mov %3, %%"REG_S" \n\t"
+ "pcmpeqd %%mm4, %%mm4 \n\t"
+ "pslld $31, %%mm4 \n\t"
+ "pxor %%mm7, %%mm7 \n\t" /* 0 */
+ "psrld $24, %%mm4 \n\t" /* FRAC_BITS >> 1 */
+
+ "1: \n\t"
+ "movd (%%"REG_S"), %%mm0 \n\t"
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 8(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd (%%"REG_d"), %%mm5 \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movd (%%"REG_d"), %%mm6 \n\t"
+ "pmullw %%mm0, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+
+ "movd 128(%%"REG_S"), %%mm0 \n\t"
+ "pmullw %%mm1, %%mm6 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 136(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd (%%"REG_d"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%mm5, %%mm6 \n\t"
+ "mov (%%"REG_D"), %%"REG_D" \n\t"
+ "pmullw %%mm0, %%mm2 \n\t"
+ "movd (%%"REG_d"), %%mm3 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "add %2, %%"REG_D" \n\t"
+ "paddusw %%mm2, %%mm6 \n\t"
+ "pmullw %%mm1, %%mm3 \n\t"
+ "paddusw %%mm3, %%mm6 \n\t"
+
+ "movq (%%"REG_D"), %%mm3 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "movq 8(%%"REG_D"), %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm6 \n\t"
+ "movd 12(%%"REG_S"), %%mm1 \n\t"
+ "punpcklwd %%mm7, %%mm0 \n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "paddd %%mm6, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "paddd %%mm4, %%mm3 \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "movd 4(%%"REG_S"), %%mm0 \n\t"
+ "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
+
+ "packssdw %%mm5, %%mm3 \n\t"
+ "packuswb %%mm7, %%mm3 \n\t"
+
+ "movd %%mm3, (%%"REG_d") \n\t"
+
+
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 4(%%"REG_d"), %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movd 4(%%"REG_d"), %%mm6 \n\t"
+ "pmullw %%mm0, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+
+ "movd 132(%%"REG_S"), %%mm0 \n\t"
+ "pmullw %%mm1, %%mm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 140(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 4(%%"REG_d"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%mm5, %%mm6 \n\t"
+ "pmullw %%mm0, %%mm2 \n\t"
+ "movd 4(%%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm6 \n\t"
+ "pmullw %%mm1, %%mm3 \n\t"
+ "paddusw %%mm3, %%mm6 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+
+ "movq 16(%%"REG_D"), %%mm3 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "movq 24(%%"REG_D"), %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm6 \n\t"
+ "punpcklwd %%mm7, %%mm0 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
+ "paddd %%mm6, %%mm5 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
+ "paddd %%mm4, %%mm3 \n\t"
+ "add %%"REG_c", (%%"REG_a") \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
+ "add $"PTR_SIZE"*1, %1 \n\t"
+ "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
+ "add $16, %%"REG_S" \n\t"
+
+ "packssdw %%mm5, %%mm3 \n\t"
+ "add %%"REG_c", %0 \n\t"
+ "packuswb %%mm7, %%mm3 \n\t"
+
+ "movd %%mm3, 4(%%"REG_d") \n\t"
+
+ "dec %%"REG_b" \n\t"
+ "jnz 1b \n\t"
+ :
+ :
+ "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):
+ "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+ int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ DWTELEM * * dst_array = sb->line + src_y;
+
+ asm volatile(
+ "mov %6, %%"REG_c" \n\t"
+ "mov %5, %%"REG_b" \n\t"
+ "mov %3, %%"REG_S" \n\t"
+ "pcmpeqd %%mm4, %%mm4 \n\t"
+ "pslld $31, %%mm4 \n\t"
+ "pxor %%mm7, %%mm7 \n\t" /* 0 */
+ "psrld $24, %%mm4 \n\t" /* FRAC_BITS >> 1 */
+
+ "1: \n\t"
+ "movd (%%"REG_S"), %%mm0 \n\t"
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 16(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd (%%"REG_d"), %%mm5 \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movd (%%"REG_d"), %%mm6 \n\t"
+ "pmullw %%mm0, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+
+ "movd 512(%%"REG_S"), %%mm0 \n\t"
+ "pmullw %%mm1, %%mm6 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 528(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd (%%"REG_d"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%mm5, %%mm6 \n\t"
+ "mov (%%"REG_D"), %%"REG_D" \n\t"
+ "pmullw %%mm0, %%mm2 \n\t"
+ "movd (%%"REG_d"), %%mm3 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "add %2, %%"REG_D" \n\t"
+ "paddusw %%mm2, %%mm6 \n\t"
+ "pmullw %%mm1, %%mm3 \n\t"
+ "paddusw %%mm3, %%mm6 \n\t"
+
+ "movq (%%"REG_D"), %%mm3 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "movq 8(%%"REG_D"), %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm6 \n\t"
+ "movd 20(%%"REG_S"), %%mm1 \n\t"
+ "punpcklwd %%mm7, %%mm0 \n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "paddd %%mm6, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "paddd %%mm4, %%mm3 \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "movd 4(%%"REG_S"), %%mm0 \n\t"
+ "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
+
+ "packssdw %%mm5, %%mm3 \n\t"
+ "packuswb %%mm7, %%mm3 \n\t"
+
+ "movd %%mm3, (%%"REG_d") \n\t"
+
+
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 4(%%"REG_d"), %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movd 4(%%"REG_d"), %%mm6 \n\t"
+ "pmullw %%mm0, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+
+ "movd 516(%%"REG_S"), %%mm0 \n\t"
+ "pmullw %%mm1, %%mm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 532(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 4(%%"REG_d"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%mm5, %%mm6 \n\t"
+ "pmullw %%mm0, %%mm2 \n\t"
+ "movd 4(%%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm6 \n\t"
+ "pmullw %%mm1, %%mm3 \n\t"
+ "paddusw %%mm3, %%mm6 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+
+ "movq 16(%%"REG_D"), %%mm3 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "movq 24(%%"REG_D"), %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm6 \n\t"
+ "punpcklwd %%mm7, %%mm0 \n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "paddd %%mm6, %%mm5 \n\t"
+ "paddd %%mm4, %%mm3 \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
+
+ "packssdw %%mm5, %%mm3 \n\t"
+ "packuswb %%mm7, %%mm3 \n\t"
+
+ "movd %%mm3, 4(%%"REG_d") \n\t"
+
+
+
+ "movd 8(%%"REG_S"), %%mm0 \n\t"
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 24(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 8(%%"REG_d"), %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movd 8(%%"REG_d"), %%mm6 \n\t"
+ "pmullw %%mm0, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+
+ "movd 520(%%"REG_S"), %%mm0 \n\t"
+ "pmullw %%mm1, %%mm6 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 536(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 8(%%"REG_d"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%mm5, %%mm6 \n\t"
+ "pmullw %%mm0, %%mm2 \n\t"
+ "movd 8(%%"REG_d"), %%mm3 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm6 \n\t"
+ "pmullw %%mm1, %%mm3 \n\t"
+ "paddusw %%mm3, %%mm6 \n\t"
+
+ "movq 32(%%"REG_D"), %%mm3 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "movq 40(%%"REG_D"), %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm6 \n\t"
+ "movd 28(%%"REG_S"), %%mm1 \n\t"
+ "punpcklwd %%mm7, %%mm0 \n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "paddd %%mm6, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "paddd %%mm4, %%mm3 \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "movd 12(%%"REG_S"), %%mm0 \n\t"
+ "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
+
+ "packssdw %%mm5, %%mm3 \n\t"
+ "packuswb %%mm7, %%mm3 \n\t"
+
+ "movd %%mm3, 8(%%"REG_d") \n\t"
+
+
+ "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 12(%%"REG_d"), %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
+ "movd 12(%%"REG_d"), %%mm6 \n\t"
+ "pmullw %%mm0, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+
+ "movd 524(%%"REG_S"), %%mm0 \n\t"
+ "pmullw %%mm1, %%mm6 \n\t"
+ "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "movd 540(%%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 12(%%"REG_d"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "mov (%%"REG_a"), %%"REG_d" \n\t"
+ "paddusw %%mm5, %%mm6 \n\t"
+ "pmullw %%mm0, %%mm2 \n\t"
+ "movd 12(%%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm6 \n\t"
+ "pmullw %%mm1, %%mm3 \n\t"
+ "paddusw %%mm3, %%mm6 \n\t"
+ "mov %0, %%"REG_d" \n\t"
+
+ "movq 48(%%"REG_D"), %%mm3 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "movq 56(%%"REG_D"), %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm6 \n\t"
+ "punpcklwd %%mm7, %%mm0 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
+ "paddd %%mm6, %%mm5 \n\t"
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
+ "paddd %%mm4, %%mm3 \n\t"
+ "add %%"REG_c", (%%"REG_a") \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
+ "add $"PTR_SIZE"*1, %1 \n\t"
+ "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
+ "add $32, %%"REG_S" \n\t"
+
+ "packssdw %%mm5, %%mm3 \n\t"
+ "add %%"REG_c", %0 \n\t"
+ "packuswb %%mm7, %%mm3 \n\t"
+
+ "movd %%mm3, 12(%%"REG_d") \n\t"
+
+ "dec %%"REG_b" \n\t"
+ "jnz 1b \n\t"
+ :
+ :
+ "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):
+ "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+}
+
+void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16) {
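+        /* the bw_8 sse2 kernel requires even b_h; odd heights fall back to the mmx version */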
+ if (!(b_h & 1))
+ inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ } else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16)
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}