[FFmpeg-devel] [PATCH] Lagarith range decoder.

Sat Sep 5 23:03:15 CEST 2009

Fixed a rare rounding bug. 1% slower than previous patch.

--Loren Merritt
-------------- next part --------------
commit 8c34a188a421a7c03b3a36d1595397e643b2a8dc
Author: Loren Merritt <pengvado at akuvian.org>
Date:   Sat Sep 5 13:59:36 2009 +0000

    extend ff_inverse[], and fix its documentation

diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 758f613..6dfaff6 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -110,8 +110,8 @@ const uint8_t ff_alternate_vertical_scan[64] = {
     38, 46, 54, 62, 39, 47, 55, 63,
 };
 
-/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
-const uint32_t ff_inverse[256]={
+/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256 */
+const uint32_t ff_inverse[257]={
          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
@@ -144,6 +144,7 @@ const uint32_t ff_inverse[256]={
   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
+  16777216
 };
 
 /* Input permutation for the simple_idct_mmx */
diff --git a/libavutil/internal.h b/libavutil/internal.h
index 38bca42..0e37a94 100644
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -124,7 +124,7 @@
 
 /* math */
 
-extern const uint32_t ff_inverse[256];
+extern const uint32_t ff_inverse[257];
 
 #if ARCH_X86
 #    define FASTDIV(a,b) \
-------------- next part --------------
diff --git a/libavcodec/lagarithrac.h b/libavcodec/lagarithrac.h
index 861e91f..2088683 100644
--- a/libavcodec/lagarithrac.h
+++ b/libavcodec/lagarithrac.h
@@ -46,7 +46,7 @@ typedef struct lag_rac {
     uint8_t *bytestream_end;    /*!< End position of input bytestream. */
 
     int prob[258];              /*!< Table of cumulative probability for each symbol. */
-    int range_hash[256];        /*!< Hash table mapping upper byte to approximate symbol. */
+    uint8_t range_hash[256];    /*!< Hash table mapping upper byte to approximate symbol. */
 } lag_rac;
 
 void lag_rac_init(lag_rac *l, GetBitContext *gb, int length);
@@ -62,27 +62,40 @@ static inline void lag_rac_refill(lag_rac *l)
     }
 }
 
-/* TODO: Optimize */
 // decodes a byte
 static inline int lag_get_rac(lag_rac *l)
 {
-    unsigned range_scaled, low_scaled;
-    int val = 0;
+    unsigned range_scaled, low_scaled, div;
+    int val;
+    uint8_t shift;
 
     lag_rac_refill(l);
 
     range_scaled = l->range >> l->scale;
-    low_scaled = l->low / range_scaled;
-
-    if (low_scaled < l->prob[255]) {
-        val = l->range_hash[low_scaled >> l->hash_shift];
-        while (l->prob[val + 1] <= low_scaled)
-            val++;
 
-        l->range = range_scaled * (l->prob[val + 1] - l->prob[val]);
-    } else {
+    if (l->low >= range_scaled * l->prob[255]) {
         val = 255;
         l->range -= range_scaled * l->prob[255];
+    } else {
+        // val = 0 is frequent enough to deserve a shortcut
+        if (l->low < range_scaled * l->prob[1]) {
+            val = 0;
+        } else {
+            // FIXME make av_log2 fast so it's usable here
+            shift = __builtin_clz(range_scaled) - 1;
+            div = ((range_scaled << shift) + (1<<23)-1) >> 23;
+            // low>>24 ensures that any cases too big for exact FASTDIV are under- rather than over-estimated
+            low_scaled = FASTDIV(l->low - (l->low>>24), div);
+            shift -= 23 + l->hash_shift;
+            shift &= 31;
+            low_scaled = (low_scaled<<shift) | (low_scaled>>(32-shift));
+            // low_scaled is now a lower bound of low/range_scaled
+            val = l->range_hash[(uint8_t)low_scaled];
+            while (l->low >= range_scaled * l->prob[val+1])
+                val++;
+        }
+
+        l->range = range_scaled * (l->prob[val+1] - l->prob[val]);
     }
 
     l->low -= range_scaled * l->prob[val];