[FFmpeg-devel] [FFmpeg-devel-irc] IRC log for 2010-02-19
Michael Niedermayer
michaelni
Wed Feb 24 13:50:42 CET 2010
On Sun, Feb 21, 2010 at 03:06:43PM -0800, Jason Garrett-Glaser wrote:
> On Sun, Feb 21, 2010 at 6:57 AM, Michael Niedermayer <michaelni at gmx.at> wrote:
> > On Sat, Feb 20, 2010 at 12:00:54AM +0000, irc at mansr.com wrote:
> >> [00:00:16] <mru> if speed matters you should use asm
> >> [00:00:27] <Dark_Shikari> we only have inline asm on x86 atm
> >> [00:00:47] <mru> sometimes inline asm is the best solution
> >> [00:01:17] <Dark_Shikari> hmm. michael's idea seems to hurt in x264.
> >> [00:02:02] <Dark_Shikari> in fact, michael's idea would hurt in ffmpeg if ffmpeg had inline SIMD like x264 did
> >> [00:02:05] <Dark_Shikari> for that code
> >> [00:02:11] <Dark_Shikari> we use simd for the following
> >> [00:02:12] <Dark_Shikari> MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32)
> >> [00:02:20] <Dark_Shikari> on two values at once
> >> [00:02:28] <Dark_Shikari> left side is simd, right side is C
> >
> > any volunteers who would send a patch?
>
> Note that since, I've changed that code locally due to some
> inspiration from your patch ;)
>
> Here's the current asm, which calculates (x>2)+(x>32) for two values
> at once. I don't think it's much better than C anymore; the main
> advantage before was that it saved 2 abs() calls, but your idea
> eliminates the need for that.
>
> static const uint64_t pb_2 = 0x0202020202020202ULL;
> static const uint64_t pb_32 = 0x2020202020202020ULL;
> int amvd;
> asm(
> "movd %1, %%mm0 \n"
> "movd %2, %%mm1 \n"
> "paddb %%mm1, %%mm0 \n"
> "pxor %%mm2, %%mm2 \n"
> "movq %%mm0, %%mm1 \n"
> "pcmpgtb %3, %%mm0 \n"
> "pcmpgtb %4, %%mm1 \n"
> "psubb %%mm0, %%mm2 \n"
> "psubb %%mm1, %%mm2 \n"
> "movd %%mm2, %0 \n"
> :"=r"(amvd)
> :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
> "m"(pb_2),"m"(pb_32)
> );
>
> Note how the input is bytes (!). Here's the trick: MVD values only
> have to be 0 to 33; any larger value tells us nothing. Maybe there's
> some scaling that goes on with MBAFF, but even then it's only 0-65.
> As a result, you can store MVD values as uint8_ts, saving enormous
> amounts of memory and cache and making fill_rectangle faster.
> Obviously this requires a little bit of extra clipping, but my
> benchmarks in x264 show that it's worth it there.
>
> This change is probably vastly more useful than the above asm (which I
> make available under LGPL in case anyone cares, but it's probably
> near-useless once the other changes are done).
I tried to switch from 16 -> 8 bit
mvd1616 goes from 283cycles -> 292cycles and mvload 108->114cycles
mvd88 stays at 354
i did not look at the gcc generated asm (did just eat)
patch below
If someone can get it faster, that's welcome.
Index: libavcodec/h264.c
===================================================================
--- libavcodec/h264.c (revision 21999)
+++ libavcodec/h264.c (working copy)
@@ -756,8 +762,8 @@
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
- FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
- FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
+ FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint8_t), fail);
+ FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint8_t), fail);
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail)
Index: libavcodec/h264_cabac.c
===================================================================
--- libavcodec/h264_cabac.c (revision 21999)
+++ libavcodec/h264_cabac.c (working copy)
@@ -1418,7 +1427,7 @@
for(i=0; i<4; i++){
h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
if(IS_DIRECT(h->sub_mb_type[i])){
- fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
+ fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
continue;
}
@@ -1430,15 +1439,18 @@
int mx, my;
const int index= 4*i + block_width*j;
int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
- int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
+ uint8_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
+START_TIMER
mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
mpx= abs(mpx-mx);
mpy= abs(mpy-my);
+ if(mpx>70) mpx=70;
+ if(mpy>70) mpy=70;
if(IS_SUB_8X8(sub_mb_type)){
mv_cache[ 1 ][0]=
mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
@@ -1467,19 +1479,20 @@
mvd_cache[ 0 ][0]= mpx;
mvd_cache[ 0 ][1]= mpy;
+STOP_TIMER("mvd88")
}
}else{
uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
- p[0] = p[1] = p[8] = p[9] = 0;
- pd[0]= pd[1]= pd[8]= pd[9]= 0;
+ p[0] = p[1] = p[8] = p[9] = 0; //64bitwrite
+ pd[0]= pd[4]= 0;
}
}
}
} else if( IS_DIRECT(mb_type) ) {
ff_h264_pred_direct_motion(h, &mb_type);
- fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
- fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+ fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
+ fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
} else {
int list, mx, my, i, mpx, mpy;
@@ -1502,12 +1515,17 @@
for(list=0; list<h->list_count; list++){
if(IS_DIR(mb_type, 0, list)){
pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
-
+START_TIMER
mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
- fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(abs(mx-mpx),abs(my-mpy)), 4);
+ mpx= abs(mx-mpx);
+ mpy= abs(my-mpy);
+ if(mpx>70) mpx=70;
+ if(mpy>70) mpy=70;
+ fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
+STOP_TIMER("mvd1616")
fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
}else
fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
@@ -1539,10 +1557,14 @@
my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
- fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(abs(mx-mpx),abs(my-mpy)), 4);
+ mpx= abs(mx-mpx);
+ mpy= abs(my-mpy);
+ if(mpx>70) mpx=70;
+ if(mpy>70) mpy=70;
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
}else{
- fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
}
}
@@ -1574,10 +1596,14 @@
my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
- fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(abs(mx-mpx),abs(my-mpy)), 4);
+ mpx= abs(mx-mpx);
+ mpy= abs(my-mpy);
+ if(mpx>70) mpx=70;
+ if(mpy>70) mpy=70;
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
}else{
- fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
}
}
Index: libavcodec/h264.h
===================================================================
--- libavcodec/h264.h (revision 22013)
+++ libavcodec/h264.h (working copy)
@@ -486,8 +499,8 @@
/* chroma_pred_mode for i4x4 or i16x16, else 0 */
uint8_t *chroma_pred_mode_table;
int last_qscale_diff;
- int16_t (*mvd_table[2])[2];
- DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2];
+ uint8_t (*mvd_table[2])[2];
+ DECLARE_ALIGNED_16(uint8_t, mvd_cache)[2][5*8][2];
uint8_t *direct_table;
uint8_t direct_cache[5*8];
@@ -732,6 +745,14 @@
#endif
}
+static av_always_inline uint16_t pack8to16(int a, int b){
+#if HAVE_BIGENDIAN
+ return (b&0xFF) + (a<<8);
+#else
+ return (a&0xFF) + (b<<8);
+#endif
+}
+
/**
* gets the chroma qp.
*/
@@ -1058,34 +1084,35 @@
if( CABAC ) {
/* XXX beurk, Load mvd */
+START_TIMER
if(USES_LIST(top_type, list)){
const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
- AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
+ AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
}else{
- AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
+ AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
}
if(USES_LIST(left_type[0], list)){
const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
- AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
- AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
+ AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
+ AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
}else{
- AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
- AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
+ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
+ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
}
if(USES_LIST(left_type[1], list)){
const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
- AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
- AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
+ AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
+ AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
}else{
- AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
- AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
+ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
+ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
}
- AV_ZERO32(h->mvd_cache [list][scan8[5 ]+1]);
- AV_ZERO32(h->mvd_cache [list][scan8[7 ]+1]);
- AV_ZERO32(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else)
- AV_ZERO32(h->mvd_cache [list][scan8[4 ]]);
- AV_ZERO32(h->mvd_cache [list][scan8[12]]);
-
+ AV_ZERO16(h->mvd_cache [list][scan8[5 ]+1]);
+ AV_ZERO16(h->mvd_cache [list][scan8[7 ]+1]);
+ AV_ZERO16(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else)
+ AV_ZERO16(h->mvd_cache [list][scan8[4 ]]);
+ AV_ZERO16(h->mvd_cache [list][scan8[12]]);
+STOP_TIMER("mvdload")
if(h->slice_type_nos == FF_B_TYPE){
fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
@@ -1414,13 +1442,13 @@
AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
}
if( CABAC ) {
- int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
- int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
+ uint8_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
+ uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
if(IS_SKIP(mb_type))
- fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4);
+ fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 2);
else
for(y=0; y<4; y++){
- AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y);
+ AV_COPY64(mvd_dst + y*b_stride, mvd_src + 8*y);
}
}
Index: libavutil/intreadwrite.h
===================================================================
--- libavutil/intreadwrite.h (revision 21999)
+++ libavutil/intreadwrite.h (working copy)
@@ -465,6 +465,10 @@
#define AV_COPY(n, d, s) \
(((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n)
+#ifndef AV_COPY16
+# define AV_COPY16(d, s) AV_COPY(16, d, s)
+#endif
+
#ifndef AV_COPY32
# define AV_COPY32(d, s) AV_COPY(32, d, s)
#endif
@@ -489,6 +493,10 @@
#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0)
+#ifndef AV_ZERO16
+# define AV_ZERO16(d) AV_ZERO(16, d)
+#endif
+
#ifndef AV_ZERO32
# define AV_ZERO32(d) AV_ZERO(32, d)
#endif
Index: libavcodec/rectangle.h
===================================================================
--- libavcodec/rectangle.h (revision 21999)
+++ libavcodec/rectangle.h (working copy)
@@ -58,7 +58,7 @@
*(uint16_t*)(p + 2*stride)= v;
*(uint16_t*)(p + 3*stride)= v;
}else if(w==4){
- const uint32_t v= size==4 ? val : val*0x01010101;
+ const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101;
*(uint32_t*)(p + 0*stride)= v;
if(h==1) return;
*(uint32_t*)(p + 1*stride)= v;
@@ -68,7 +68,7 @@
}else if(w==8){
//gcc can't optimize 64bit math on x86_32
#if HAVE_FAST_64BIT
- const uint64_t v= val*0x0100000001ULL;
+ const uint64_t v= size==2 ? val*0x0001000100010001ULL : val*0x0100000001ULL;
*(uint64_t*)(p + 0*stride)= v;
if(h==1) return;
*(uint64_t*)(p + 1*stride)= v;
@@ -87,34 +87,35 @@
*(uint64_t*)(p + 0+3*stride)= v;
*(uint64_t*)(p + 8+3*stride)= v;
#else
- *(uint32_t*)(p + 0+0*stride)= val;
- *(uint32_t*)(p + 4+0*stride)= val;
+ const uint32_t v= size==2 ? val*0x00010001 : val*0x01010101;
+ *(uint32_t*)(p + 0+0*stride)= v;
+ *(uint32_t*)(p + 4+0*stride)= v;
if(h==1) return;
- *(uint32_t*)(p + 0+1*stride)= val;
- *(uint32_t*)(p + 4+1*stride)= val;
+ *(uint32_t*)(p + 0+1*stride)= v;
+ *(uint32_t*)(p + 4+1*stride)= v;
if(h==2) return;
- *(uint32_t*)(p + 0+2*stride)= val;
- *(uint32_t*)(p + 4+2*stride)= val;
- *(uint32_t*)(p + 0+3*stride)= val;
- *(uint32_t*)(p + 4+3*stride)= val;
+ *(uint32_t*)(p + 0+2*stride)= v;
+ *(uint32_t*)(p + 4+2*stride)= v;
+ *(uint32_t*)(p + 0+3*stride)= v;
+ *(uint32_t*)(p + 4+3*stride)= v;
}else if(w==16){
- *(uint32_t*)(p + 0+0*stride)= val;
- *(uint32_t*)(p + 4+0*stride)= val;
- *(uint32_t*)(p + 8+0*stride)= val;
- *(uint32_t*)(p +12+0*stride)= val;
- *(uint32_t*)(p + 0+1*stride)= val;
- *(uint32_t*)(p + 4+1*stride)= val;
- *(uint32_t*)(p + 8+1*stride)= val;
- *(uint32_t*)(p +12+1*stride)= val;
+ *(uint32_t*)(p + 0+0*stride)= v;
+ *(uint32_t*)(p + 4+0*stride)= v;
+ *(uint32_t*)(p + 8+0*stride)= v;
+ *(uint32_t*)(p +12+0*stride)= v;
+ *(uint32_t*)(p + 0+1*stride)= v;
+ *(uint32_t*)(p + 4+1*stride)= v;
+ *(uint32_t*)(p + 8+1*stride)= v;
+ *(uint32_t*)(p +12+1*stride)= v;
if(h==2) return;
- *(uint32_t*)(p + 0+2*stride)= val;
- *(uint32_t*)(p + 4+2*stride)= val;
- *(uint32_t*)(p + 8+2*stride)= val;
- *(uint32_t*)(p +12+2*stride)= val;
- *(uint32_t*)(p + 0+3*stride)= val;
- *(uint32_t*)(p + 4+3*stride)= val;
- *(uint32_t*)(p + 8+3*stride)= val;
- *(uint32_t*)(p +12+3*stride)= val;
+ *(uint32_t*)(p + 0+2*stride)= v;
+ *(uint32_t*)(p + 4+2*stride)= v;
+ *(uint32_t*)(p + 8+2*stride)= v;
+ *(uint32_t*)(p +12+2*stride)= v;
+ *(uint32_t*)(p + 0+3*stride)= v;
+ *(uint32_t*)(p + 4+3*stride)= v;
+ *(uint32_t*)(p + 8+3*stride)= v;
+ *(uint32_t*)(p +12+3*stride)= v;
#endif
}else
assert(0);
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Answering whether a program halts or runs forever is
On a Turing machine, in general impossible (Turing's halting problem).
On any real computer, always possible as a real computer has a finite number
of states N, and will either halt in less than N cycles or never halt.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20100224/226c6538/attachment.pgp>
More information about the ffmpeg-devel
mailing list