[FFmpeg-devel] [PATCH] A rather simple H.264 speed optimization
Jason Garrett-Glaser
darkshikari
Mon Jul 28 17:44:06 CEST 2008
On Mon, Jul 28, 2008 at 9:19 AM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Sun, Jul 27, 2008 at 10:50:29PM -0600, Jason Garrett-Glaser wrote:
>> $subject, gains 6 clock cycles or so per decode_cabac_residual call on
>> an ordinary source.
>
> ok
applied
round 2:
I used the extra casts in the p[0] p[8] etc cases for clarity, to
avoid changing the array indices.
Index: libavcodec/h264.c
===================================================================
--- libavcodec/h264.c (revision 14459)
+++ libavcodec/h264.c (working copy)
@@ -354,19 +354,15 @@
if(USES_LIST(top_type, list)){
const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
- *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]=
*(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
- *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]=
*(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
- *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]=
*(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
- *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]=
*(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
+ *(uint64_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]=
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 0];
+ *(uint64_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]=
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 2];
h->ref_cache[list][scan8[0] + 0 - 1*8]=
h->ref_cache[list][scan8[0] + 1 - 1*8]=
s->current_picture.ref_index[list][b8_xy + 0];
h->ref_cache[list][scan8[0] + 2 - 1*8]=
h->ref_cache[list][scan8[0] + 3 - 1*8]=
s->current_picture.ref_index[list][b8_xy + 1];
}else{
- *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
- *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
- *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
- *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
+ *(uint64_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
+ *(uint64_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= 0;
*(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]=
((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
}
@@ -428,15 +424,11 @@
/* XXX beurk, Load mvd */
if(USES_LIST(top_type, list)){
const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
- *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 -
1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
- *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 -
1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
- *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 -
1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
- *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 -
1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
+ *(uint64_t*)h->mvd_cache[list][scan8[0] + 0 -
1*8]= *(uint64_t*)h->mvd_table[list][b_xy + 0];
+ *(uint64_t*)h->mvd_cache[list][scan8[0] + 2 -
1*8]= *(uint64_t*)h->mvd_table[list][b_xy + 2];
}else{
- *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
- *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
- *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
- *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
+ *(uint64_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
+ *(uint64_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]= 0;
}
if(USES_LIST(left_type[0], list)){
const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
@@ -5698,6 +5690,7 @@
int mpx, mpy;
int mx, my;
const int index= 4*i + block_width*j;
+ uint32_t mv, mvd;
int16_t (* mv_cache)[2]= &h->mv_cache[list][
scan8[index] ];
int16_t (* mvd_cache)[2]=
&h->mvd_cache[list][ scan8[index] ];
pred_motion(h, index, block_width, list,
h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
@@ -5706,40 +5699,26 @@
my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
+ mv = pack16to32( mx, my );
+ mvd = pack16to32( mx - mpx, my - mpy );
if(IS_SUB_8X8(sub_mb_type)){
- mv_cache[ 1 ][0]=
- mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
- mv_cache[ 1 ][1]=
- mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
-
- mvd_cache[ 1 ][0]=
- mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
- mvd_cache[ 1 ][1]=
- mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
+ *(uint32_t*) mv_cache[1]=*(uint32_t*)
mv_cache[8]=*(uint32_t*) mv_cache[9]= mv;
+
*(uint32_t*)mvd_cache[1]=*(uint32_t*)mvd_cache[8]=*(uint32_t*)mvd_cache[9]=
mvd;
}else if(IS_SUB_8X4(sub_mb_type)){
- mv_cache[ 1 ][0]= mx;
- mv_cache[ 1 ][1]= my;
-
- mvd_cache[ 1 ][0]= mx - mpx;
- mvd_cache[ 1 ][1]= my - mpy;
+ *(uint32_t*) mv_cache[1]= mv;
+ *(uint32_t*)mvd_cache[1]= mvd;
}else if(IS_SUB_4X8(sub_mb_type)){
- mv_cache[ 8 ][0]= mx;
- mv_cache[ 8 ][1]= my;
-
- mvd_cache[ 8 ][0]= mx - mpx;
- mvd_cache[ 8 ][1]= my - mpy;
+ *(uint32_t*) mv_cache[8]= mv;
+ *(uint32_t*)mvd_cache[8]= mvd;
}
- mv_cache[ 0 ][0]= mx;
- mv_cache[ 0 ][1]= my;
-
- mvd_cache[ 0 ][0]= mx - mpx;
- mvd_cache[ 0 ][1]= my - mpy;
+ *(uint32_t*) mv_cache[0]= mv;
+ *(uint32_t*)mvd_cache[0]= mvd;
}
}else{
- uint32_t *p= (uint32_t *)&h->mv_cache[list][
scan8[4*i] ][0];
- uint32_t *pd= (uint32_t *)&h->mvd_cache[list][
scan8[4*i] ][0];
- p[0] = p[1] = p[8] = p[9] = 0;
- pd[0]= pd[1]= pd[8]= pd[9]= 0;
+ uint32_t *p = (uint32_t*)&h->mv_cache[list][
scan8[4*i] ][0];
+ uint32_t *pd= (uint32_t*)&h->mvd_cache[list][
scan8[4*i] ][0];
+ *(uint64_t*)&p[0] = *(uint64_t*)&p[8] = 0;
+ *(uint64_t*)&pd[0]= *(uint64_t*)&pd[8]= 0;
}
}
}
@@ -5908,7 +5887,7 @@
}
} else {
uint8_t * const nnz= &h->non_zero_count_cache[
scan8[4*i8x8] ];
- nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+ *(uint16_t*)&nnz[0] = *(uint16_t*)&nnz[8] = 0;
}
}
}
@@ -5933,14 +5912,14 @@
}
} else {
uint8_t * const nnz= &h->non_zero_count_cache[0];
- nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8
] =nnz[ scan8[16]+9 ] =
- nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8
] =nnz[ scan8[20]+9 ] = 0;
+ *(uint16_t*)&nnz[ scan8[16]+0 ] = *(uint16_t*)&nnz[ scan8[16]+8 ] =
+ *(uint16_t*)&nnz[ scan8[20]+0 ] = *(uint16_t*)&nnz[
scan8[20]+8 ] = 0;
}
} else {
uint8_t * const nnz= &h->non_zero_count_cache[0];
fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
- nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ]
=nnz[ scan8[16]+9 ] =
- nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ]
=nnz[ scan8[20]+9 ] = 0;
+ *(uint16_t*)&nnz[ scan8[16]+0 ] = *(uint16_t*)&nnz[ scan8[16]+8 ] =
+ *(uint16_t*)&nnz[ scan8[20]+0 ] = *(uint16_t*)&nnz[ scan8[20]+8 ] = 0;
h->last_qscale_diff = 0;
}
More information about the ffmpeg-devel
mailing list