[FFmpeg-devel] [FFmpeg-devel-irc] IRC log for 2010-02-19
Michael Niedermayer
michaelni
Wed Feb 24 13:50:42 CET 2010
On Sun, Feb 21, 2010 at 03:06:43PM -0800, Jason Garrett-Glaser wrote:
> On Sun, Feb 21, 2010 at 6:57 AM, Michael Niedermayer <michaelni at gmx.at> wrote:
> > On Sat, Feb 20, 2010 at 12:00:54AM +0000, irc at mansr.com wrote:
> >> [00:00:16] <mru> if speed matters you should use asm
> >> [00:00:27] <Dark_Shikari> we only have inline asm on x86 atm
> >> [00:00:47] <mru> sometimes inline asm is the best solution
> >> [00:01:17] <Dark_Shikari> hmm. michael's idea seems to hurt in x264.
> >> [00:02:02] <Dark_Shikari> in fact, michael's idea would hurt in ffmpeg if ffmpeg had inline SIMD like x264 did
> >> [00:02:05] <Dark_Shikari> for that code
> >> [00:02:11] <Dark_Shikari> we use simd for the following
> >> [00:02:12] <Dark_Shikari> MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32)
> >> [00:02:20] <Dark_Shikari> on two values at once
> >> [00:02:28] <Dark_Shikari> left side is simd, right side is C
> >
> > any volunteers who would send a patch?
>
> Note that since, I've changed that code locally due to some
> inspiration from your patch ;)
>
> Here's the current asm, which calculates (x>2)+(x>32) for two values
> at once. I don't think it's much better than C anymore; the main
> advantage before was that it saved 2 abs() calls, but your idea
> eliminates the need for that.
>
> static const uint64_t pb_2 = 0x0202020202020202ULL;
> static const uint64_t pb_32 = 0x2020202020202020ULL;
> int amvd;
> asm(
> "movd %1, %%mm0 \n"
> "movd %2, %%mm1 \n"
> "paddb %%mm1, %%mm0 \n"
> "pxor %%mm2, %%mm2 \n"
> "movq %%mm0, %%mm1 \n"
> "pcmpgtb %3, %%mm0 \n"
> "pcmpgtb %4, %%mm1 \n"
> "psubb %%mm0, %%mm2 \n"
> "psubb %%mm1, %%mm2 \n"
> "movd %%mm2, %0 \n"
> :"=r"(amvd)
> :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
> "m"(pb_2),"m"(pb_32)
> );
>
> Note how the input is bytes (!). Here's the trick: MVD values only
> have to be 0 to 33; any larger value tells us nothing. Maybe there's
> some scaling that goes on with MBAFF, but even then it's only 0-65.
> As a result, you can store MVD values as uint8_ts, saving enormous
> amounts of memory and cache and making fill_rectangle faster.
> Obviously this requires a little bit of extra clipping, but my
> benchmarks in x264 show that it's worth it there.
>
> This change is probably vastly more useful than the above asm (which I
> make available under LGPL in case anyone cares, but it's probably
> near-useless once the other changes are done).
I tried to switch from 16 -> 8 bit
mvd1616 goes from 283cycles -> 292cycles and mvload 108->114cycles
mvd88 stays at 354
i did not look at the gcc generated asm (did just eat)
patch below
If someone can get it faster, that's welcome.
Index: libavcodec/h264.c
===================================================================
--- libavcodec/h264.c (revision 21999)
+++ libavcodec/h264.c (working copy)
@@ -756,8 +762,8 @@
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
- FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
- FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
+ FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint8_t), fail);
+ FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint8_t), fail);
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail)
Index: libavcodec/h264_cabac.c
===================================================================
--- libavcodec/h264_cabac.c (revision 21999)
+++ libavcodec/h264_cabac.c (working copy)
@@ -1418,7 +1427,7 @@
for(i=0; i<4; i++){
h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
if(IS_DIRECT(h->sub_mb_type[i])){
- fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
+ fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
continue;
}
@@ -1430,15 +1439,18 @@
int mx, my;
const int index= 4*i + block_width*j;
int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
- int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
+ uint8_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
+START_TIMER
mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
mpx= abs(mpx-mx);
mpy= abs(mpy-my);
+ if(mpx>70) mpx=70;
+ if(mpy>70) mpy=70;
if(IS_SUB_8X8(sub_mb_type)){
mv_cache[ 1 ][0]=
mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
@@ -1467,19 +1479,20 @@
mvd_cache[ 0 ][0]= mpx;
mvd_cache[ 0 ][1]= mpy;
+STOP_TIMER("mvd88")
}
}else{
uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
- p[0] = p[1] = p[8] = p[9] = 0;
- pd[0]= pd[1]= pd[8]= pd[9]= 0;
+ p[0] = p[1] = p[8] = p[9] = 0; //64bitwrite
+ pd[0]= pd[4]= 0;
}
}
}
} else if( IS_DIRECT(mb_type) ) {
ff_h264_pred_direct_motion(h, &mb_type);
- fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
- fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+ fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
+ fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
} else {
int list, mx, my, i, mpx, mpy;
@@ -1502,12 +1515,17 @@
for(list=0; list<h->list_count; list++){
if(IS_DIR(mb_type, 0, list)){
pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
-
+START_TIMER
mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
- fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(abs(mx-mpx),abs(my-mpy)), 4);
+ mpx= abs(mx-mpx);
+ mpy= abs(my-mpy);
+ if(mpx>70) mpx=70;
+ if(mpy>70) mpy=70;
+ fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
+STOP_TIMER("mvd1616")
fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
}else
fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
@@ -1539,10 +1557,14 @@
my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
- fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(abs(mx-mpx),abs(my-mpy)), 4);
+ mpx= abs(mx-mpx);
+ mpy= abs(my-mpy);
+ if(mpx>70) mpx=70;
+ if(mpy>70) mpy=70;
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
}else{
- fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
}
}
@@ -1574,10 +1596,14 @@
my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
tprintf(s->avctx, "final mv:%d %d\n", mx, my);
- fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(abs(mx-mpx),abs(my-mpy)), 4);
+ mpx= abs(mx-mpx);
+ mpy= abs(my-mpy);
+ if(mpx>70) mpx=70;
+ if(mpy>70) mpy=70;
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
}else{
- fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
}
}
Index: libavcodec/h264.h
===================================================================
--- libavcodec/h264.h (revision 22013)
+++ libavcodec/h264.h (working copy)
@@ -486,8 +499,8 @@
/* chroma_pred_mode for i4x4 or i16x16, else 0 */
uint8_t *chroma_pred_mode_table;
int last_qscale_diff;
- int16_t (*mvd_table[2])[2];
- DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2];
+ uint8_t (*mvd_table[2])[2];
+ DECLARE_ALIGNED_16(uint8_t, mvd_cache)[2][5*8][2];
uint8_t *direct_table;
uint8_t direct_cache[5*8];
@@ -732,6 +745,14 @@
#endif
}
+static av_always_inline uint16_t pack8to16(int a, int b){
+#if HAVE_BIGENDIAN
+ return (b&0xFF) + (a<<8);
+#else
+ return (a&0xFF) + (b<<8);
+#endif
+}
+
/**
* gets the chroma qp.
*/
@@ -1058,34 +1084,35 @@
if( CABAC ) {
/* XXX beurk, Load mvd */
+START_TIMER
if(USES_LIST(top_type, list)){
const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
- AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
+ AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
}else{
- AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
+ AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
}
if(USES_LIST(left_type[0], list)){
const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
- AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
- AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
+ AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
+ AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
}else{
- AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
- AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
+ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
+ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
}
if(USES_LIST(left_type[1], list)){
const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
- AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
- AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
+ AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
+ AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
}else{
- AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
- AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
+ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
+ AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
}
- AV_ZERO32(h->mvd_cache [list][scan8[5 ]+1]);
- AV_ZERO32(h->mvd_cache [list][scan8[7 ]+1]);
- AV_ZERO32(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else)
- AV_ZERO32(h->mvd_cache [list][scan8[4 ]]);
- AV_ZERO32(h->mvd_cache [list][scan8[12]]);
-
+ AV_ZERO16(h->mvd_cache [list][scan8[5 ]+1]);
+ AV_ZERO16(h->mvd_cache [list][scan8[7 ]+1]);
+ AV_ZERO16(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else)
+ AV_ZERO16(h->mvd_cache [list][scan8[4 ]]);
+ AV_ZERO16(h->mvd_cache [list][scan8[12]]);
+STOP_TIMER("mvdload")
if(h->slice_type_nos == FF_B_TYPE){
fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
@@ -1414,13 +1442,13 @@
AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
}
if( CABAC ) {
- int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
- int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
+ uint8_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
+ uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
if(IS_SKIP(mb_type))
- fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4);
+ fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 2);
else
for(y=0; y<4; y++){
- AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y);
+ AV_COPY64(mvd_dst + y*b_stride, mvd_src + 8*y);
}
}
Index: libavutil/intreadwrite.h
===================================================================
--- libavutil/intreadwrite.h (revision 21999)
+++ libavutil/intreadwrite.h (working copy)
@@ -465,6 +465,10 @@
#define AV_COPY(n, d, s) \
(((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n)
+#ifndef AV_COPY16
+# define AV_COPY16(d, s) AV_COPY(16, d, s)
+#endif
+
#ifndef AV_COPY32
# define AV_COPY32(d, s) AV_COPY(32, d, s)
#endif
@@ -489,6 +493,10 @@
#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0)
+#ifndef AV_ZERO16
+# define AV_ZERO16(d) AV_ZERO(16, d)
+#endif
+
#ifndef AV_ZERO32
# define AV_ZERO32(d) AV_ZERO(32, d)
#endif
Index: libavcodec/rectangle.h
===================================================================
--- libavcodec/rectangle.h (revision 21999)
+++ libavcodec/rectangle.h (working copy)
@@ -58,7 +58,7 @@
*(uint16_t*)(p + 2*stride)= v;
*(uint16_t*)(p + 3*stride)= v;
}else if(w==4){
- const uint32_t v= size==4 ? val : val*0x01010101;
+ const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101;
*(uint32_t*)(p + 0*stride)= v;
if(h==1) return;
*(uint32_t*)(p + 1*stride)= v;
@@ -68,7 +68,7 @@
}else if(w==8){
//gcc can't optimize 64bit math on x86_32
#if HAVE_FAST_64BIT
- const uint64_t v= val*0x0100000001ULL;
+ const uint64_t v= size==2 ? val*0x0001000100010001ULL : val*0x0100000001ULL;
*(uint64_t*)(p + 0*stride)= v;
if(h==1) return;
*(uint64_t*)(p + 1*stride)= v;
@@ -87,34 +87,35 @@
*(uint64_t*)(p + 0+3*stride)= v;
*(uint64_t*)(p + 8+3*stride)= v;
#else
- *(uint32_t*)(p + 0+0*stride)= val;
- *(uint32_t*)(p + 4+0*stride)= val;
+ const uint32_t v= size==2 ? val*0x00010001 : val*0x01010101;
+ *(uint32_t*)(p + 0+0*stride)= v;
+ *(uint32_t*)(p + 4+0*stride)= v;
if(h==1) return;
- *(uint32_t*)(p + 0+1*stride)= val;
- *(uint32_t*)(p + 4+1*stride)= val;
+ *(uint32_t*)(p + 0+1*stride)= v;
+ *(uint32_t*)(p + 4+1*stride)= v;
if(h==2) return;
- *(uint32_t*)(p + 0+2*stride)= val;
- *(uint32_t*)(p + 4+2*stride)= val;
- *(uint32_t*)(p + 0+3*stride)= val;
- *(uint32_t*)(p + 4+3*stride)= val;
+ *(uint32_t*)(p + 0+2*stride)= v;
+ *(uint32_t*)(p + 4+2*stride)= v;
+ *(uint32_t*)(p + 0+3*stride)= v;
+ *(uint32_t*)(p + 4+3*stride)= v;
}else if(w==16){
- *(uint32_t*)(p + 0+0*stride)= val;
- *(uint32_t*)(p + 4+0*stride)= val;
- *(uint32_t*)(p + 8+0*stride)= val;
- *(uint32_t*)(p +12+0*stride)= val;
- *(uint32_t*)(p + 0+1*stride)= val;
- *(uint32_t*)(p + 4+1*stride)= val;
- *(uint32_t*)(p + 8+1*stride)= val;
- *(uint32_t*)(p +12+1*stride)= val;
+ *(uint32_t*)(p + 0+0*stride)= v;
+ *(uint32_t*)(p + 4+0*stride)= v;
+ *(uint32_t*)(p + 8+0*stride)= v;
+ *(uint32_t*)(p +12+0*stride)= v;
+ *(uint32_t*)(p + 0+1*stride)= v;
+ *(uint32_t*)(p + 4+1*stride)= v;
+ *(uint32_t*)(p + 8+1*stride)= v;
+ *(uint32_t*)(p +12+1*stride)= v;
if(h==2) return;
- *(uint32_t*)(p + 0+2*stride)= val;
- *(uint32_t*)(p + 4+2*stride)= val;
- *(uint32_t*)(p + 8+2*stride)= val;
- *(uint32_t*)(p +12+2*stride)= val;
- *(uint32_t*)(p + 0+3*stride)= val;
- *(uint32_t*)(p + 4+3*stride)= val;
- *(uint32_t*)(p + 8+3*stride)= val;
- *(uint32_t*)(p +12+3*stride)= val;
+ *(uint32_t*)(p + 0+2*stride)= v;
+ *(uint32_t*)(p + 4+2*stride)= v;
+ *(uint32_t*)(p + 8+2*stride)= v;
+ *(uint32_t*)(p +12+2*stride)= v;
+ *(uint32_t*)(p + 0+3*stride)= v;
+ *(uint32_t*)(p + 4+3*stride)= v;
+ *(uint32_t*)(p + 8+3*stride)= v;
+ *(uint32_t*)(p +12+3*stride)= v;
#endif
}else
assert(0);
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Answering whether a program halts or runs forever is
On a Turing machine, in general impossible (Turing's halting problem).
On any real computer, always possible as a real computer has a finite number
of states N, and will either halt in less than N cycles or never halt.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20100224/226c6538/attachment.pgp>
More information about the ffmpeg-devel
mailing list