[FFmpeg-devel] [PATCH] A rather simple H.264 speed optimization

Mon Jul 28 17:44:06 CEST 2008

On Mon, Jul 28, 2008 at 9:19 AM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Sun, Jul 27, 2008 at 10:50:29PM -0600, Jason Garrett-Glaser wrote:
>> $subject, gains 6 clock cycles or so per decode_cabac_residual call on
>> an ordinary source.
>
> ok

applied

round 2:

In the p[0], p[8], etc. cases I kept the existing uint32_t pointers and
added extra casts for clarity, to avoid changing the array indices.

Index: libavcodec/h264.c
===================================================================

--- libavcodec/h264.c	(revision 14459)
+++ libavcodec/h264.c	(working copy)
@@ -354,19 +354,15 @@
             if(USES_LIST(top_type, list)){
                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]=
*(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]=
*(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]=
*(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
-                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]=
*(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
+                *(uint64_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]=
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 0];
+                *(uint64_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]=
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 2];
                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
                 h->ref_cache[list][scan8[0] + 1 - 1*8]=
s->current_picture.ref_index[list][b8_xy + 0];
                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
                 h->ref_cache[list][scan8[0] + 3 - 1*8]=
s->current_picture.ref_index[list][b8_xy + 1];
             }else{
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
-                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
+                *(uint64_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
+                *(uint64_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= 0;
                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]=
((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
             }

@@ -428,15 +424,11 @@
                 /* XXX beurk, Load mvd */
                 if(USES_LIST(top_type, list)){
                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 -
1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 -
1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 -
1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 -
1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
+                    *(uint64_t*)h->mvd_cache[list][scan8[0] + 0 -
1*8]= *(uint64_t*)h->mvd_table[list][b_xy + 0];
+                    *(uint64_t*)h->mvd_cache[list][scan8[0] + 2 -
1*8]= *(uint64_t*)h->mvd_table[list][b_xy + 2];
                 }else{
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
-                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
+                    *(uint64_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
+                    *(uint64_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]= 0;
                 }
                 if(USES_LIST(left_type[0], list)){
                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
@@ -5698,6 +5690,7 @@
                         int mpx, mpy;
                         int mx, my;
                         const int index= 4*i + block_width*j;
+                        uint32_t mv, mvd;
                         int16_t (* mv_cache)[2]= &h->mv_cache[list][
scan8[index] ];
                         int16_t (* mvd_cache)[2]=
&h->mvd_cache[list][ scan8[index] ];
                         pred_motion(h, index, block_width, list,
h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
@@ -5706,40 +5699,26 @@
                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);

+                        mv  = pack16to32( mx, my );
+                        mvd = pack16to32( mx - mpx, my - mpy );
                         if(IS_SUB_8X8(sub_mb_type)){
-                            mv_cache[ 1 ][0]=
-                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
-                            mv_cache[ 1 ][1]=
-                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
-
-                            mvd_cache[ 1 ][0]=
-                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
-                            mvd_cache[ 1 ][1]=
-                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
+                            *(uint32_t*) mv_cache[1]=*(uint32_t*)
mv_cache[8]=*(uint32_t*) mv_cache[9]= mv;
+
*(uint32_t*)mvd_cache[1]=*(uint32_t*)mvd_cache[8]=*(uint32_t*)mvd_cache[9]=
mvd;
                         }else if(IS_SUB_8X4(sub_mb_type)){
-                            mv_cache[ 1 ][0]= mx;
-                            mv_cache[ 1 ][1]= my;
-
-                            mvd_cache[ 1 ][0]= mx - mpx;
-                            mvd_cache[ 1 ][1]= my - mpy;
+                            *(uint32_t*) mv_cache[1]= mv;
+                            *(uint32_t*)mvd_cache[1]= mvd;
                         }else if(IS_SUB_4X8(sub_mb_type)){
-                            mv_cache[ 8 ][0]= mx;
-                            mv_cache[ 8 ][1]= my;
-
-                            mvd_cache[ 8 ][0]= mx - mpx;
-                            mvd_cache[ 8 ][1]= my - mpy;
+                            *(uint32_t*) mv_cache[8]= mv;
+                            *(uint32_t*)mvd_cache[8]= mvd;
                         }
-                        mv_cache[ 0 ][0]= mx;
-                        mv_cache[ 0 ][1]= my;
-
-                        mvd_cache[ 0 ][0]= mx - mpx;
-                        mvd_cache[ 0 ][1]= my - mpy;
+                        *(uint32_t*) mv_cache[0]= mv;
+                        *(uint32_t*)mvd_cache[0]= mvd;
                     }
                 }else{
-                    uint32_t *p= (uint32_t *)&h->mv_cache[list][
scan8[4*i] ][0];
-                    uint32_t *pd= (uint32_t *)&h->mvd_cache[list][
scan8[4*i] ][0];
-                    p[0] = p[1] = p[8] = p[9] = 0;
-                    pd[0]= pd[1]= pd[8]= pd[9]= 0;
+                    uint32_t *p = (uint32_t*)&h->mv_cache[list][
scan8[4*i] ][0];
+                    uint32_t *pd= (uint32_t*)&h->mvd_cache[list][
scan8[4*i] ][0];
+                    *(uint64_t*)&p[0] = *(uint64_t*)&p[8] = 0;
+                    *(uint64_t*)&pd[0]= *(uint64_t*)&pd[8]= 0;
                 }
             }
         }
@@ -5908,7 +5887,7 @@
                     }
                 } else {
                     uint8_t * const nnz= &h->non_zero_count_cache[
scan8[4*i8x8] ];
-                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+                    *(uint16_t*)&nnz[0] = *(uint16_t*)&nnz[8] = 0;
                 }
             }
         }
@@ -5933,14 +5912,14 @@
             }
         } else {
             uint8_t * const nnz= &h->non_zero_count_cache[0];
-            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8
] =nnz[ scan8[16]+9 ] =
-            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8
] =nnz[ scan8[20]+9 ] = 0;
+            *(uint16_t*)&nnz[ scan8[16]+0 ] = *(uint16_t*)&nnz[ scan8[16]+8 ] =
+            *(uint16_t*)&nnz[ scan8[20]+0 ] = *(uint16_t*)&nnz[
scan8[20]+8 ] = 0;
         }
     } else {
         uint8_t * const nnz= &h->non_zero_count_cache[0];
         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
-        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ]
=nnz[ scan8[16]+9 ] =
-        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ]
=nnz[ scan8[20]+9 ] = 0;
+        *(uint16_t*)&nnz[ scan8[16]+0 ] = *(uint16_t*)&nnz[ scan8[16]+8 ] =
+        *(uint16_t*)&nnz[ scan8[20]+0 ] = *(uint16_t*)&nnz[ scan8[20]+8 ] = 0;
         h->last_qscale_diff = 0;
     }