[Ffmpeg-devel] [PATCH] Snow mc_block mmx optimization
Oded Shimon
ods15
Fri Mar 24 15:18:04 CET 2006
On Thu, Mar 23, 2006 at 09:43:29PM +0100, Guillaume POIRIER wrote:
> Hi,
>
> On 3/22/06, Oded Shimon <ods15 at ods15.dyndns.org> wrote:
> > My turn!
> >
> > Just for qpel (I think? maybe also for odd resolutions)
> >
> > C version is faster as well, by a factor of ~2, and mmx version is about ~8
> > times faster... md5sums pass. I think total speed increase is 10-20% for
> > files encoded with qpel...
>
> Doesn't apply here...
New patch. Also has the _mmx function names you wanted.
- ods15
-------------- next part --------------
Index: libavcodec/dsputil.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.c,v
retrieving revision 1.137
diff -u -r1.137 dsputil.c
--- libavcodec/dsputil.c 23 Mar 2006 20:16:35 -0000 1.137
+++ libavcodec/dsputil.c 24 Mar 2006 14:13:27 -0000
@@ -3775,6 +3775,85 @@
static void just_return() { return; }
+static always_inline void mc_block_x(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){ /* horizontal pass of the snow 6-tap subpel filter; dx = x offset in 1/16 pel (even values only) */
+ int x, y;
+ if (dy == 0) { /* no vertical pass will follow: write final pixels straight into dst */
+ b_h -= 5; /* the 5 extra rows are only needed as input for the vertical pass */
+ tmp = dst;
+ src += 2*stride; /* skip the 2-row top margin the vertical pass would have consumed */
+ }
+ if (dx != 0) for (y = 0; y < b_h+5; y++) { /* dx==0: nothing to do, the y pass reads src directly */
+ for (x = 0; x < b_w; x++) {
+ int a0= src[x ];
+ int a1= src[x + 1];
+ int a2= src[x + 2];
+ int a3= src[x + 3];
+ int a4= src[x + 4];
+ int a5= src[x + 5];
+ int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5); /* 6-tap halfpel value, scaled x32 (20+20-5-5+1+1) */
+
+ if(dx<8) am = (32*a2*( 8-dx) + am* dx + 128)>>8; /* blend nearest full pel with the halfpel value, round, drop the x32 scale */
+ else am = ( am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
+
+ if(am&(~255)) am= ~(am>>31); /* branchless clip to 0..255 */
+
+ tmp[x] = am;
+ }
+ tmp += stride;
+ src += stride;
+ }
+}
+
+static always_inline void mc_block_y(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){ /* vertical pass of the snow 6-tap subpel filter; dy = y offset in 1/16 pel (even values only) */
+ int x, y;
+ if (dx == 0) tmp = (uint8_t*)src + 2; /* x pass did nothing: filter src directly (+2 undoes the 2-pixel left margin) */
+ if (dy != 0) for (y = 0; y < b_h; y++) {
+ for (x = 0; x < b_w; x++) {
+ int a0= tmp[x + 0*stride];
+ int a1= tmp[x + 1*stride];
+ int a2= tmp[x + 2*stride];
+ int a3= tmp[x + 3*stride];
+ int a4= tmp[x + 4*stride];
+ int a5= tmp[x + 5*stride];
+ int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5); /* 6-tap halfpel value, scaled x32 */
+
+ if(dy<8) am = (32*a2*( 8-dy) + am* dy + 128)>>8; /* blend nearest row with the halfpel value, round, drop the x32 scale */
+ else am = ( am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
+
+ if(am&(~255)) am= ~(am>>31); /* branchless clip to 0..255 */
+
+ dst[x] = am;
+ }
+ dst += stride;
+ tmp += stride;
+ } else if (dx == 0) { // dy==0 too: plain copy (when dy==0 and dx!=0 the x pass already wrote dst)
+ tmp += 2*stride; /* skip the 2-row top margin */
+ for (y = 0; y < b_h; y++) {
+ memcpy(dst, tmp, b_w);
+ dst += stride;
+ tmp += stride;
+ }
+ }
+}
+
+#define mca(a)\
+static void mc_block_x ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+ mc_block_x(dst, src, tmp, stride, b_w, b_h, a, dy);\
+}\
+static void mc_block_y ## a(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+ mc_block_y(dst, src, tmp, stride, b_w, b_h, dx, a);\
+}
+/* one x and one y wrapper per even 1/16-pel offset, so the constant dx/dy folds into each always_inline body */
+mca(0)
+mca(2)
+mca(4)
+mca(6)
+mca(8)
+mca(10)
+mca(12)
+mca(14)
+#undef mca
+
/* init static data */
void dsputil_static_init(void)
{
@@ -4054,6 +4133,24 @@
c->vertical_compose97i = ff_snow_vertical_compose97i;
c->horizontal_compose97i = ff_snow_horizontal_compose97i;
c->inner_add_yblock = ff_snow_inner_add_yblock;
+
+ c->mc_block_x[0] = mc_block_x0;
+ c->mc_block_x[1] = mc_block_x2;
+ c->mc_block_x[2] = mc_block_x4;
+ c->mc_block_x[3] = mc_block_x6;
+ c->mc_block_x[4] = mc_block_x8;
+ c->mc_block_x[5] = mc_block_x10;
+ c->mc_block_x[6] = mc_block_x12;
+ c->mc_block_x[7] = mc_block_x14;
+
+ c->mc_block_y[0] = mc_block_y0;
+ c->mc_block_y[1] = mc_block_y2;
+ c->mc_block_y[2] = mc_block_y4;
+ c->mc_block_y[3] = mc_block_y6;
+ c->mc_block_y[4] = mc_block_y8;
+ c->mc_block_y[5] = mc_block_y10;
+ c->mc_block_y[6] = mc_block_y12;
+ c->mc_block_y[7] = mc_block_y14;
#endif
c->prefetch= just_return;
Index: libavcodec/dsputil.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.h,v
retrieving revision 1.132
diff -u -r1.132 dsputil.h
--- libavcodec/dsputil.h 24 Mar 2006 01:33:22 -0000 1.132
+++ libavcodec/dsputil.h 24 Mar 2006 14:13:28 -0000
@@ -133,6 +133,7 @@
// allthough currently h<4 is not used as functions with width <8 are not used and neither implemented
typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
+typedef void (*mc_block_func)(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy);
// for snow slices
typedef struct slice_buffer_s slice_buffer;
@@ -344,6 +345,9 @@
void (*horizontal_compose97i)(DWTELEM *b, int width);
void (*inner_add_yblock)(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+ mc_block_func mc_block_x[8];
+ mc_block_func mc_block_y[8];
+
void (*prefetch)(void *mem, int stride, int h);
} DSPContext;
Index: libavcodec/snow.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/snow.c,v
retrieving revision 1.94
diff -u -r1.94 snow.c
--- libavcodec/snow.c 20 Mar 2006 05:52:23 -0000 1.94
+++ libavcodec/snow.c 24 Mar 2006 14:13:33 -0000
@@ -2294,91 +2294,6 @@
}
}
-static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
- int x, y;
-START_TIMER
- for(y=0; y < b_h+5; y++){
- for(x=0; x < b_w; x++){
- int a0= src[x ];
- int a1= src[x + 1];
- int a2= src[x + 2];
- int a3= src[x + 3];
- int a4= src[x + 4];
- int a5= src[x + 5];
-// int am= 9*(a1+a2) - (a0+a3);
- int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
-// int am= 18*(a2+a3) - 2*(a1+a4);
-// int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
-// int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
-
-// if(b_w==16) am= 8*(a1+a2);
-
- if(dx<8) am = (32*a2*( 8-dx) + am* dx + 128)>>8;
- else am = ( am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
-
- /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
- if(am&(~255)) am= ~(am>>31);
-
- tmp[x] = am;
-
-/* if (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) + aL* dx + 32)>>6;
- else if(dx< 8) tmp[x + y*stride]= ( aL*( 8-dx) + am*(dx- 4) + 32)>>6;
- else if(dx<12) tmp[x + y*stride]= ( am*(12-dx) + aR*(dx- 8) + 32)>>6;
- else tmp[x + y*stride]= ( aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
- }
- tmp += stride;
- src += stride;
- }
- tmp -= (b_h+5)*stride;
-
- for(y=0; y < b_h; y++){
- for(x=0; x < b_w; x++){
- int a0= tmp[x + 0*stride];
- int a1= tmp[x + 1*stride];
- int a2= tmp[x + 2*stride];
- int a3= tmp[x + 3*stride];
- int a4= tmp[x + 4*stride];
- int a5= tmp[x + 5*stride];
- int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
-// int am= 18*(a2+a3) - 2*(a1+a4);
-/* int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
- int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
-
-// if(b_w==16) am= 8*(a1+a2);
-
- if(dy<8) am = (32*a2*( 8-dy) + am* dy + 128)>>8;
- else am = ( am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
-
- if(am&(~255)) am= ~(am>>31);
-
- dst[x] = am;
-/* if (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) + aL* dy + 32)>>6;
- else if(dy< 8) tmp[x + y*stride]= ( aL*( 8-dy) + am*(dy- 4) + 32)>>6;
- else if(dy<12) tmp[x + y*stride]= ( am*(12-dy) + aR*(dy- 8) + 32)>>6;
- else tmp[x + y*stride]= ( aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
- }
- dst += stride;
- tmp += stride;
- }
-STOP_TIMER("mc_block")
-}
-
-#define mca(dx,dy,b_w)\
-static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
- uint8_t tmp[stride*(b_w+5)];\
- assert(h==b_w);\
- mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
-}
-
-mca( 0, 0,16)
-mca( 8, 0,16)
-mca( 0, 8,16)
-mca( 8, 8,16)
-mca( 0, 0,8)
-mca( 8, 0,8)
-mca( 0, 8,8)
-mca( 8, 8,8)
-
static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
if(block->type & BLOCK_INTRA){
int x, y;
@@ -2437,9 +2352,13 @@
// assert(!(b_w&(b_w-1)));
assert(b_w>1 && b_h>1);
assert(tab_index>=0 && tab_index<4 || b_w==32);
- if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
- mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
- else if(b_w==32){
+ if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1))) {
+ START_TIMER
+ assert(!(dx&1) && !(dy&1));
+ s->dsp.mc_block_x[dx>>1](dst, src, tmp, stride, b_w, b_h, dx, dy);
+ s->dsp.mc_block_y[dy>>1](dst, src, tmp, stride, b_w, b_h, dx, dy);
+ STOP_TIMER("mc_block")
+ } else if(b_w==32){
int y;
for(y=0; y<b_h; y+=16){
s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
@@ -3765,19 +3684,6 @@
mcf( 8,12)
mcf(12,12)
-#define mcfh(dx,dy)\
- s->dsp.put_pixels_tab [0][dy/4+dx/8]=\
- s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
- mc_block_hpel ## dx ## dy ## 16;\
- s->dsp.put_pixels_tab [1][dy/4+dx/8]=\
- s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
- mc_block_hpel ## dx ## dy ## 8;
-
- mcfh(0, 0)
- mcfh(8, 0)
- mcfh(0, 8)
- mcfh(8, 8)
-
if(!qexp[0])
init_qexp();
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
retrieving revision 1.118
diff -u -r1.118 dsputil_mmx.c
--- libavcodec/i386/dsputil_mmx.c 23 Mar 2006 20:16:36 -0000 1.118
+++ libavcodec/i386/dsputil_mmx.c 24 Mar 2006 14:13:35 -0000
@@ -2585,6 +2585,181 @@
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+static always_inline void mc_block_core_mmx(int dx, uint8_t * dst) { /* filter+blend 4 pixels: expects a0..a5 in mm0..mm5, 0 in mm6 and 64 in mm7 (set up by mc_block_x_mmx) */
+ asm volatile("punpcklbw %%mm6, %%mm0 \n\t" // widen the 4 bytes of each tap to words
+ "punpcklbw %%mm6, %%mm1 \n\t"
+ "punpcklbw %%mm6, %%mm2 \n\t"
+ "punpcklbw %%mm6, %%mm3 \n\t"
+ "punpcklbw %%mm6, %%mm4 \n\t"
+ "punpcklbw %%mm6, %%mm5 \n\t"
+
+ "paddsw %%mm5, %%mm0 \n\t" // am = a0+a5;
+
+ "paddsw %%mm4, %%mm1 \n\t" // a1 += a4;
+ "movq %%mm1, %%mm4 \n\t" // a4 = a1;
+ "psllw $2, %%mm1 \n\t" // a1 *= 4;
+ "paddsw %%mm4, %%mm1 \n\t" // a1 += a4; -> a1 = 5*(a1+a4)
+ "psubsw %%mm1, %%mm0 \n\t" // am -= 5*(a1+a4);
+
+ "movq %%mm2, %%mm4 \n\t" // a4 = a2;
+ "paddsw %%mm3, %%mm4 \n\t" // a4 += a3;
+
+ "psllw $2, %%mm4 \n\t" // a4 *= 4;
+ "paddsw %%mm4, %%mm0 \n\t" // am += 4*(a2+a3);
+ "psllw $2, %%mm4 \n\t" // a4 *= 4;
+ "paddsw %%mm4, %%mm0 \n\t" // am += 16*(a2+a3); am now = 20*(a2+a3) - 5*(a1+a4) + a0 + a5
+ ::);
+ switch (dx) { /* fold in the subpel blend; each case keeps an x32 total scale so +64 >> 7 below matches the C (+128) >> 8 rounding */
+ case 2: asm volatile("psllw $5, %%mm2 \n\t" // a2 <<= 5;
+ "movq %%mm2, %%mm3 \n\t" // a3 = a2;
+ "psllw $1, %%mm2 \n\t" // a2 <<= 1;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ ::); break; /* am = am + 96*a2 */
+ case 4: asm volatile("psllw $6, %%mm2 \n\t" // a2 <<= 6;
+ "psllw $1, %%mm0 \n\t" // am <<= 1;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ ::); break; /* am = 2*am + 64*a2 */
+ case 6: asm volatile("psllw $5, %%mm2 \n\t" // a2 <<= 5;
+ "movq %%mm0, %%mm3 \n\t" // a3 = am;
+ "psllw $1, %%mm0 \n\t" // am <<= 1;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ ::); break; /* am = 3*am + 32*a2 */
+ case 8: asm volatile("psllw $2, %%mm0 \n\t" // am <<= 2;
+ ::); break; /* am = 4*am (pure halfpel) */
+ case 10: asm volatile("psllw $5, %%mm3 \n\t" // a3 <<= 5;
+ "movq %%mm0, %%mm2 \n\t" // a2 = am;
+ "psllw $1, %%mm0 \n\t" // am <<= 1;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ ::); break; /* am = 3*am + 32*a3 */
+ case 12: asm volatile("psllw $6, %%mm3 \n\t" // a3 <<= 6;
+ "psllw $1, %%mm0 \n\t" // am <<= 1;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ ::); break; /* am = 2*am + 64*a3 */
+ case 14: asm volatile("psllw $5, %%mm3 \n\t" // a3 <<= 5;
+ "movq %%mm3, %%mm2 \n\t" // a2 = a3;
+ "psllw $1, %%mm3 \n\t" // a3 <<= 1;
+ "paddsw %%mm2, %%mm0 \n\t" // am += a2;
+ "paddsw %%mm3, %%mm0 \n\t" // am += a3;
+ ::); break; /* am = am + 96*a3 */
+ }
+ asm volatile("paddsw %%mm7, %%mm0 \n\t" // am += 64; (rounding)
+ "psraw $7, %%mm0 \n\t" // am >>= 7;
+ "packuswb %%mm6, %%mm0 \n\t" // saturate words to 0..255 (replaces the C clip)
+ "movd %%mm0, (%0) \n\t" // tmp[x] = am;
+ ::"r"(dst));
+}
+
+static always_inline void mc_block_x_mmx(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){ /* MMX horizontal pass; mirrors mc_block_x in dsputil.c, 4 pixels per iteration */
+ int x, y;
+ assert(!(b_w&3) && !(b_h&3) && !(dx&1));
+ asm volatile("pcmpeqw %%mm7, %%mm7 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "psrlw $9, %%mm7 \n\t" // 64 (rounding constant for mc_block_core_mmx)
+ "pxor %%mm6, %%mm6 \n\t" // 0 -- NOTE: no emms here; mm6/mm7 stay live for mc_block_y_mmx, which must run right after
+ ::);
+ if (dy == 0) { /* no vertical pass will follow: write final pixels straight into dst */
+ b_h -= 5;
+ tmp = dst;
+ src += 2*stride;
+ }
+ if (dx != 0) for (y = 0; y < b_h+5; y++) {
+ for (x = 0; x < b_w-3; x += 4) {
+ asm volatile("movd (%0), %%mm0 \n\t" // am = src[x ];
+ "movd 1(%0), %%mm1 \n\t" // a1 = src[x + 1];
+ "movd 2(%0), %%mm2 \n\t" // a2 = src[x + 2];
+ "movd 3(%0), %%mm3 \n\t" // a3 = src[x + 3];
+ "movd 4(%0), %%mm4 \n\t" // a4 = src[x + 4];
+ "movd 5(%0), %%mm5 \n\t" // a5 = src[x + 5];
+ ::"r"(&src[x]));
+
+ mc_block_core_mmx(dx, &tmp[x]);
+ }
+ for (; x < b_w; x++) { /* scalar tail; unreachable while b_w is a multiple of 4 (see assert), kept as a safety net */
+ int a0= src[x ];
+ int a1= src[x + 1];
+ int a2= src[x + 2];
+ int a3= src[x + 3];
+ int a4= src[x + 4];
+ int a5= src[x + 5];
+ int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+
+ if(dx<8) am = (32*a2*( 8-dx) + am* dx + 128)>>8;
+ else am = ( am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
+
+ if(am&(~255)) am= ~(am>>31);
+
+ tmp[x] = am;
+ }
+ tmp += stride;
+ src += stride;
+ }
+}
+
+static always_inline void mc_block_y_mmx(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){ /* MMX vertical pass; relies on mm6/mm7 being set up by mc_block_x_mmx, which is always called first */
+ int x, y;
+ if (dx == 0) tmp = (uint8_t*)src + 2; /* x pass did nothing: filter src directly (+2 undoes the 2-pixel left margin) */
+ if (dy != 0) for (y = 0; y < b_h; y++) {
+ for (x = 0; x < b_w-3; x += 4) {
+ asm volatile("movd (%0), %%mm0 \n\t" // am = tmp[x + 0*stride];
+ "movd (%1), %%mm1 \n\t" // a1 = tmp[x + 1*stride];
+ "movd (%0,%2,2), %%mm2 \n\t" // a2 = tmp[x + 2*stride];
+ "movd (%1,%2,2), %%mm3 \n\t" // a3 = tmp[x + 3*stride];
+ "movd (%0,%2,4), %%mm4 \n\t" // a4 = tmp[x + 4*stride];
+ "movd (%1,%2,4), %%mm5 \n\t" // a5 = tmp[x + 5*stride];
+ ::"r"(&tmp[x]),"r"(&tmp[x+stride]),"a"(stride));
+
+ mc_block_core_mmx(dy, &dst[x]);
+ }
+ for (; x < b_w; x++){ /* scalar tail; unreachable while b_w is a multiple of 4, kept as a safety net */
+ int a0= tmp[x + 0*stride];
+ int a1= tmp[x + 1*stride];
+ int a2= tmp[x + 2*stride];
+ int a3= tmp[x + 3*stride];
+ int a4= tmp[x + 4*stride];
+ int a5= tmp[x + 5*stride];
+ int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+
+ if(dy<8) am = (32*a2*( 8-dy) + am* dy + 128)>>8;
+ else am = ( am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
+
+ if(am&(~255)) am= ~(am>>31);
+
+ dst[x] = am;
+ }
+ dst += stride;
+ tmp += stride;
+ } else if (dx == 0) { // dy==0 too: plain copy (when dy==0 and dx!=0 the x pass already wrote dst)
+ tmp += 2*stride; /* skip the 2-row top margin */
+ for (y = 0; y < b_h; y++) {
+ memcpy(dst, tmp, b_w);
+ dst += stride;
+ tmp += stride;
+ }
+ }
+ asm volatile("emms"::); // leave the MMX/FPU state clean for callers
+}
+
+#define mca(a)\
+static void mc_block_x ## a ## _mmx(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+ mc_block_x_mmx(dst, src, tmp, stride, b_w, b_h, a, dy);\
+}\
+static void mc_block_y ## a ## _mmx(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){\
+ mc_block_y_mmx(dst, src, tmp, stride, b_w, b_h, dx, a);\
+}
+/* one x and one y wrapper per even 1/16-pel offset, constant-folded into the always_inline bodies */
+mca(0)
+mca(2)
+mca(4)
+mca(6)
+mca(8)
+mca(10)
+mca(12)
+mca(14) /* NOTE(review): unlike the dsputil.c copy, mca is not #undef'd here -- harmless but inconsistent */
+
#endif
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
@@ -2991,6 +3166,24 @@
c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
}
+
+ c->mc_block_x[0] = mc_block_x0_mmx;
+ c->mc_block_x[1] = mc_block_x2_mmx;
+ c->mc_block_x[2] = mc_block_x4_mmx;
+ c->mc_block_x[3] = mc_block_x6_mmx;
+ c->mc_block_x[4] = mc_block_x8_mmx;
+ c->mc_block_x[5] = mc_block_x10_mmx;
+ c->mc_block_x[6] = mc_block_x12_mmx;
+ c->mc_block_x[7] = mc_block_x14_mmx;
+
+ c->mc_block_y[0] = mc_block_y0_mmx;
+ c->mc_block_y[1] = mc_block_y2_mmx;
+ c->mc_block_y[2] = mc_block_y4_mmx;
+ c->mc_block_y[3] = mc_block_y6_mmx;
+ c->mc_block_y[4] = mc_block_y8_mmx;
+ c->mc_block_y[5] = mc_block_y10_mmx;
+ c->mc_block_y[6] = mc_block_y12_mmx;
+ c->mc_block_y[7] = mc_block_y14_mmx;
#endif
}
More information about the ffmpeg-devel
mailing list