[Ffmpeg-devel] H.264 encoder
Panagiotis Issaris
takis.issaris
Thu Oct 5 18:11:19 CEST 2006
Hi,
On Wed, Oct 04, 2006 at 01:11:07PM +0200, Michael Niedermayer wrote:
> > > [...]
> > >
> > >
> > >> + for (i = 0 ; i < rbsplen ; i++)
> > >> + {
> > >> + if (i + 2 < rbsplen && (rbsp[i] == 0 && rbsp[i+1] == 0 && rbsp[i+2] < 4))
> > >> + {
> > >> + dest[destpos++] = rbsp[i++];
> > >> + dest[destpos++] = rbsp[i];
> > >> + dest[destpos++] = 0x03; // emulation prevention byte
> > >> + }
> > >> + else
> > >> + dest[destpos++] = rbsp[i];
> > >> + }
> > >>
> > >
> > > instead of a loop which checks every byte, you could check just every 2nd
> > > also see encode_nal() in h264.c, maybe you can use some parts from that
Fixed by borrowing code from encode_nal(): there is now a first pass over the
bytes that looks for any occurrence of the byte sequence which needs escaping.
If none is found, the data is simply copied; if such an occurrence is found,
the previous, slower method is used.
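
For reference, the escaping logic now works roughly like this (a minimal
sketch with illustrative names, not the exact code from the patch):

    #include <stdint.h>
    #include <string.h>

    /* Escape an RBSP into dst, returning the number of bytes written.
     * dst is assumed to be large enough (len + number of escapes). */
    static int escape_rbsp(uint8_t *dst, const uint8_t *rbsp, int len)
    {
        int i, di, escape_count = 0;

        /* Pass 1: scan every 2nd byte. Any 00 00 pair contains a zero at
         * an even or an odd offset, so stepping back one byte whenever a
         * zero is hit still finds every 00 00 0x sequence. */
        for (i = 0; i < len; i += 2) {
            if (rbsp[i])
                continue;
            if (i > 0 && rbsp[i - 1] == 0)
                i--;
            if (i + 2 < len && rbsp[i + 1] == 0 && rbsp[i + 2] <= 3) {
                escape_count++;
                i += 2;
            }
        }

        if (escape_count == 0) {   /* common case: nothing to escape */
            memcpy(dst, rbsp, len);
            return len;
        }

        /* Pass 2 (rare): insert the emulation prevention bytes. */
        for (i = 0, di = 0; i < len; i++) {
            if (i + 2 < len && rbsp[i] == 0 && rbsp[i + 1] == 0 && rbsp[i + 2] < 4) {
                dst[di++] = rbsp[i++];
                dst[di++] = rbsp[i];
                dst[di++] = 0x03;  /* emulation prevention byte */
            } else {
                dst[di++] = rbsp[i];
            }
        }
        return di;
    }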
I've attached an updated patch with this issue fixed. The patch also removes
encode_nal(), as it is not used anywhere else. Hope that's okay :)
I haven't changed the line length yet as I was not sure what to change it to
(see my next e-mail).
With friendly regards,
Takis
-------------- next part --------------
diff --git a/Changelog b/Changelog
index 1bcfe36..b9e1917 100644
--- a/Changelog
+++ b/Changelog
@@ -59,6 +59,7 @@ version <next>
- VP5 video decoder
- VP6 video decoder
- WavPack lossless audio decoder
+- Native H.264 encoder
version 0.4.9-pre1:
diff --git a/doc/ffmpeg-doc.texi b/doc/ffmpeg-doc.texi
index c41807d..76644eb 100644
--- a/doc/ffmpeg-doc.texi
+++ b/doc/ffmpeg-doc.texi
@@ -772,7 +772,7 @@ following image formats are supported:
@item WMV8 @tab X @tab X @tab not completely working
@item H.261 @tab X @tab X
@item H.263(+) @tab X @tab X @tab also known as RealVideo 1.0
-@item H.264 @tab @tab X
+@item H.264 @tab X @tab X
@item RealVideo 1.0 @tab X @tab X
@item RealVideo 2.0 @tab X @tab X
@item MJPEG @tab X @tab X
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index fbf8e0b..74b90f9 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -87,6 +87,7 @@ OBJS-$(CONFIG_FRAPS_DECODER) +
OBJS-$(CONFIG_H261_DECODER) += h261.o
OBJS-$(CONFIG_H261_ENCODER) += h261.o
OBJS-$(CONFIG_H264_DECODER) += h264.o
+OBJS-$(CONFIG_H264_ENCODER) += h264enc.o h264cavlc.o h264dsp.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuv.o
OBJS-$(CONFIG_HUFFYUV_ENCODER) += huffyuv.o
OBJS-$(CONFIG_IDCIN_DECODER) += idcinvideo.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index fdd80fc..d575908 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -75,7 +75,7 @@ #ifdef CONFIG_MPEG1VIDEO_ENCODER
register_avcodec(&mpeg1video_encoder);
#endif //CONFIG_MPEG1VIDEO_ENCODER
#ifdef CONFIG_H264_ENCODER
-// register_avcodec(&h264_encoder);
+ register_avcodec(&h264_encoder);
#endif //CONFIG_H264_ENCODER
#ifdef CONFIG_MPEG2VIDEO_ENCODER
register_avcodec(&mpeg2video_encoder);
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index ad7c776..db9e8a3 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -142,6 +142,7 @@ enum CodecID {
CODEC_ID_VP5,
CODEC_ID_VP6,
CODEC_ID_VP6F,
+ CODEC_ID_FFH264,
/* various pcm "codecs" */
CODEC_ID_PCM_S16LE= 0x10000,
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index ae5902f..1e3ce7f 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2624,6 +2624,11 @@ void ff_put_vc1_mspel_mc00_c(uint8_t *ds
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
+#if defined(CONFIG_H264_ENCODER)
+/* H264 specific */
+void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
+#endif /* CONFIG_H264_ENCODER */
+
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
uint8_t *cm = cropTbl + MAX_NEG_CROP;
int i;
@@ -4081,6 +4086,9 @@ #endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
ff_vc1dsp_init(c,avctx);
#endif
+#if defined(CONFIG_H264_ENCODER)
+ ff_h264dsp_init(c,avctx);
+#endif
c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 34d91ab..1038e5d 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -57,6 +57,13 @@ void ff_h264_idct8_dc_add_c(uint8_t *dst
void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
+void ff_h264_dct_c(DCTELEM inblock[4][4], DCTELEM outblock[4][4]);
+void ff_h264_hadamard_mult4x4_c(DCTELEM Y[4][4]);
+void ff_h264_transform_dct_quant_c(int16_t block[4][4], int QP, int dontscaleDC);
+void ff_h264_hadamard_quant_4x4_c(DCTELEM Y[4][4], int QP);
+void ff_h264_hadamard_quant_2x2_c(int16_t Y[2][2], int QP);
+void ff_h264_hadamard_invquant_4x4_c(DCTELEM Y[4][4], int QP);
+void ff_h264_transform_inverse_quant_dct_add_c(int16_t block[4][4], int QP, int dontscaleDC, uint8_t *dst, int stride);
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
const float *src2, int src3, int blocksize, int step);
@@ -376,10 +383,19 @@ #define FF_PARTTRANS_IDCT_PERM 5
#define BASIS_SHIFT 16
#define RECON_SHIFT 6
+ /* h264 functions */
void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+ void (*h264_dct)(DCTELEM inblock[4][4], DCTELEM outblock[4][4]);
+ void (*h264_idct_notranspose_add)(uint8_t *dst, DCTELEM *block, int stride);
+ void (*h264_hadamard_mult4x4)(DCTELEM Y[4][4]);
+ void (*h264_hadamard_quant_2x2)(int16_t Y[2][2], int QP);
+ void (*h264_hadamard_quant_4x4)(DCTELEM Y[4][4], int QP);
+ void (*h264_hadamard_invquant_4x4)(DCTELEM Y[4][4], int QP);
+ void (*h264_transform_dct_quant)(int16_t block[4][4], int QP, int dontscaleDC);
+ void (*h264_transform_inverse_quant_dct_add)(int16_t block[4][4], int QP, int dontscaleDC, uint8_t *dst, int stride);
/* snow wavelet */
void (*vertical_compose97i)(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 8602276..e0e3499 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -163,20 +163,6 @@ typedef struct H264Context{
MpegEncContext s;
int nal_ref_idc;
int nal_unit_type;
-#define NAL_SLICE 1
-#define NAL_DPA 2
-#define NAL_DPB 3
-#define NAL_DPC 4
-#define NAL_IDR_SLICE 5
-#define NAL_SEI 6
-#define NAL_SPS 7
-#define NAL_PPS 8
-#define NAL_AUD 9
-#define NAL_END_SEQUENCE 10
-#define NAL_END_STREAM 11
-#define NAL_FILLER_DATA 12
-#define NAL_SPS_EXT 13
-#define NAL_AUXILIARY_SLICE 19
uint8_t *rbsp_buffer;
unsigned int rbsp_buffer_size;
@@ -420,13 +406,22 @@ #else
#endif
}
+const uint8_t rem6[52]={
+0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+};
+
+const uint8_t div6[52]={
+0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
+};
+
+
/**
* fill a rectangle.
* @param h height of the rectangle, should be a constant
* @param w width of the rectangle, should be a constant
* @param size the size of val (1 or 4), should be a constant
*/
-static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
+always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
uint8_t *p= (uint8_t*)vp;
assert(size==1 || size==4);
assert(w<=4);
@@ -1806,70 +1801,7 @@ #endif
return dst;
}
-#if 0
-/**
- * @param src the data which should be escaped
- * @param dst the target buffer, dst+1 == src is allowed as a special case
- * @param length the length of the src data
- * @param dst_length the length of the dst array
- * @returns length of escaped data in bytes or -1 if an error occured
- */
-static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
- int i, escape_count, si, di;
- uint8_t *temp;
-
- assert(length>=0);
- assert(dst_length>0);
-
- dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
-
- if(length==0) return 1;
-
- escape_count= 0;
- for(i=0; i<length; i+=2){
- if(src[i]) continue;
- if(i>0 && src[i-1]==0)
- i--;
- if(i+2<length && src[i+1]==0 && src[i+2]<=3){
- escape_count++;
- i+=2;
- }
- }
-
- if(escape_count==0){
- if(dst+1 != src)
- memcpy(dst+1, src, length);
- return length + 1;
- }
-
- if(length + escape_count + 1> dst_length)
- return -1;
-
- //this should be damn rare (hopefully)
-
- h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
- temp= h->rbsp_buffer;
-//printf("encoding esc\n");
-
- si= 0;
- di= 0;
- while(si < length){
- if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
- temp[di++]= 0; si++;
- temp[di++]= 0; si++;
- temp[di++]= 3;
- temp[di++]= src[si++];
- }
- else
- temp[di++]= src[si++];
- }
- memcpy(dst+1, temp, length+escape_count);
-
- assert(di == length+escape_count);
-
- return di + 1;
-}
-
+#if 1
/**
* write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
*/
@@ -2033,42 +1965,6 @@ static inline int get_chroma_qp(int chro
return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
}
-
-#if 0
-static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
- int i;
- //FIXME try int temp instead of block
-
- for(i=0; i<4; i++){
- const int d0= src1[0 + i*stride] - src2[0 + i*stride];
- const int d1= src1[1 + i*stride] - src2[1 + i*stride];
- const int d2= src1[2 + i*stride] - src2[2 + i*stride];
- const int d3= src1[3 + i*stride] - src2[3 + i*stride];
- const int z0= d0 + d3;
- const int z3= d0 - d3;
- const int z1= d1 + d2;
- const int z2= d1 - d2;
-
- block[0 + 4*i]= z0 + z1;
- block[1 + 4*i]= 2*z3 + z2;
- block[2 + 4*i]= z0 - z1;
- block[3 + 4*i]= z3 - 2*z2;
- }
-
- for(i=0; i<4; i++){
- const int z0= block[0*4 + i] + block[3*4 + i];
- const int z3= block[0*4 + i] - block[3*4 + i];
- const int z1= block[1*4 + i] + block[2*4 + i];
- const int z2= block[1*4 + i] - block[2*4 + i];
-
- block[0*4 + i]= z0 + z1;
- block[1*4 + i]= 2*z3 + z2;
- block[2*4 + i]= z0 - z1;
- block[3*4 + i]= z3 - 2*z2;
- }
-}
-#endif
-
//FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
//FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
@@ -2355,7 +2251,7 @@ static void pred4x4_horizontal_down_c(ui
src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
}
-static void pred16x16_vertical_c(uint8_t *src, int stride){
+void pred16x16_vertical_c(uint8_t *src, int stride){
int i;
const uint32_t a= ((uint32_t*)(src-stride))[0];
const uint32_t b= ((uint32_t*)(src-stride))[1];
@@ -2370,7 +2266,7 @@ static void pred16x16_vertical_c(uint8_t
}
}
-static void pred16x16_horizontal_c(uint8_t *src, int stride){
+void pred16x16_horizontal_c(uint8_t *src, int stride){
int i;
for(i=0; i<16; i++){
@@ -2381,7 +2277,7 @@ static void pred16x16_horizontal_c(uint8
}
}
-static void pred16x16_dc_c(uint8_t *src, int stride){
+void pred16x16_dc_c(uint8_t *src, int stride){
int i, dc=0;
for(i=0;i<16; i++){
@@ -2435,7 +2331,7 @@ static void pred16x16_top_dc_c(uint8_t *
}
}
-static void pred16x16_128_dc_c(uint8_t *src, int stride){
+void pred16x16_128_dc_c(uint8_t *src, int stride){
int i;
for(i=0; i<16; i++){
@@ -2486,11 +2382,11 @@ static inline void pred16x16_plane_compa
}
}
-static void pred16x16_plane_c(uint8_t *src, int stride){
+void pred16x16_plane_c(uint8_t *src, int stride){
pred16x16_plane_compat_c(src, stride, 0);
}
-static void pred8x8_vertical_c(uint8_t *src, int stride){
+void pred8x8_vertical_c(uint8_t *src, int stride){
int i;
const uint32_t a= ((uint32_t*)(src-stride))[0];
const uint32_t b= ((uint32_t*)(src-stride))[1];
@@ -2501,7 +2397,7 @@ static void pred8x8_vertical_c(uint8_t *
}
}
-static void pred8x8_horizontal_c(uint8_t *src, int stride){
+void pred8x8_horizontal_c(uint8_t *src, int stride){
int i;
for(i=0; i<8; i++){
@@ -2510,7 +2406,7 @@ static void pred8x8_horizontal_c(uint8_t
}
}
-static void pred8x8_128_dc_c(uint8_t *src, int stride){
+void pred8x8_128_dc_c(uint8_t *src, int stride){
int i;
for(i=0; i<8; i++){
@@ -2564,7 +2460,7 @@ static void pred8x8_top_dc_c(uint8_t *sr
}
-static void pred8x8_dc_c(uint8_t *src, int stride){
+void pred8x8_dc_c(uint8_t *src, int stride){
int i;
int dc0, dc1, dc2, dc3;
@@ -2589,7 +2485,7 @@ static void pred8x8_dc_c(uint8_t *src, i
}
}
-static void pred8x8_plane_c(uint8_t *src, int stride){
+void pred8x8_plane_c(uint8_t *src, int stride){
int j, k;
int a;
uint8_t *cm = cropTbl + MAX_NEG_CROP;
diff --git a/libavcodec/h264cavlc.c b/libavcodec/h264cavlc.c
new file mode 100644
index 0000000..61d9637
--- /dev/null
+++ b/libavcodec/h264cavlc.c
@@ -0,0 +1,311 @@
+/*
+ * H.264 encoder
+ * Copyright (c) 2006 Expertisecentrum Digitale Media, UHasselt
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "common.h"
+#include "dsputil.h"
+#include "avcodec.h"
+#include "bitstream.h"
+#include "mpegvideo.h"
+#include "h264data.h"
+#include "h264encdata.h"
+
+//#define DEBUG_H264CAVLC
+
+static int length_table[7][4095];
+static int code_table[7][4095];
+
+void h264cavlc_generate_tables()
+{
+ int vlcnum, level;
+ for (vlcnum=0; vlcnum<7; vlcnum++)
+ {
+ for(level=-2047; level<2048; level++)
+ {
+ int sign = level < 0;
+ int levabs = ABS(level);
+ int len, inf;
+
+ if (vlcnum == 0)
+ {
+
+ if (levabs < 8)
+ {
+ len = levabs * 2 + sign - 1;
+ inf = 1;
+ }
+ else if (levabs < 8+8)
+ {
+ len = 14 + 1 + 4;
+ inf = (1 << 4) | ((levabs - 8) << 1) | sign;
+ }
+ else
+ {
+ len = 14 + 2 + 12;
+ inf = (0x1 << 12) | ((levabs - 16)<< 1) | sign;
+ }
+ length_table[vlcnum][level+2047] = len;
+ code_table[vlcnum][level+2047] = inf;
+ }
+ else
+ {
+ int iCodeword;
+ int iLength;
+ int shift = vlcnum-1;
+ int escape = (15<<shift)+1;
+ int numPrefix;
+ int sufmask = ~((0xffffffff)<<shift);
+ int suffix;
+
+ numPrefix = (levabs-1)>>shift;
+ suffix = (levabs-1)&sufmask;
+
+#ifdef DEBUG_H264CAVLC
+ printf("numPrefix %d | suffix %d | levabs %d | escape %d | sufmask %d | vlcnum %d | level %d | sign %d\n",
+ numPrefix,suffix,levabs,escape,sufmask,vlcnum,level,sign);
+#endif // DEBUG_H264CAVLC
+ if (levabs < escape)
+ {
+ iLength = numPrefix + vlcnum + 1;
+ iCodeword = (1<<(shift+1))|(suffix<<1)|sign;
+ }
+ else
+ {
+ iLength = 28;
+ iCodeword = (1<<12)|((levabs-escape)<<1)|sign;
+ }
+ len = iLength;
+ inf = iCodeword;
+
+#ifdef DEBUG_H264CAVLC
+ printf("len %d | code %d\n",len,inf);
+#endif // DEBUG_H264CAVLC
+
+ length_table[vlcnum][level+2047] = len;
+ code_table[vlcnum][level+2047] = inf;
+ }
+ }
+ }
+}
+
+static inline void h264cavlc_encode_vlc_level(PutBitContext *b, int vlcnum, int16_t level)
+{
+ int16_t index;
+ index = level+2047;
+ put_bits(b,length_table[vlcnum][index],code_table[vlcnum][index]);
+#ifdef DEBUG_H264CAVLC
+// av_log(NULL, AV_LOG_DEBUG, "Encoded level with number %d\n",code_table[vlcnum][index]);
+#endif
+}
+
+static inline void h264cavlc_encode_vlc_totalzeros(PutBitContext *b, int vlcnum, int total_zeros)
+{
+ put_bits(b,total_zeros_len[vlcnum][total_zeros],total_zeros_bits[vlcnum][total_zeros]);
+}
+
+static inline void h264cavlc_encode_vlc_run(PutBitContext *b, int vlcnum, int runbefore)
+{
+ put_bits(b,run_len[vlcnum][runbefore],run_bits[vlcnum][runbefore]);
+}
+
+static inline void h264cavlc_encode_vlc_coefftoken(PutBitContext *b, int lookup_table, int total_coeffs, int trailing_ones)
+{
+ put_bits(b,coeff_token_len[lookup_table][trailing_ones+total_coeffs*4],coeff_token_bits[lookup_table][trailing_ones+total_coeffs*4]);
+}
+
+static inline void h264cavlc_encode_vlc_coefftoken_chromadc(PutBitContext *b, int total_coeffs, int trailing_ones)
+{
+ put_bits(b,chroma_dc_coeff_token_len[trailing_ones + total_coeffs * 4],chroma_dc_coeff_token_bits[trailing_ones + total_coeffs * 4]);
+}
+
+static inline void h264cavlc_encode_vlc_totalzeros_chromadc(PutBitContext *b, int vlcnum, int value)
+{
+ if(vlcnum + value == 3) put_bits(b, value , 0);
+ else put_bits(b, value+1, 1);
+}
+
+static inline int h264cavlc_get_lookup_table(int na, int nb)
+{
+ int nc = 0;
+ int8_t lookup_table[8] = {0, 0, 1, 1, 2, 2, 2, 2};
+
+ if (na >= 0 && nb >= 0)
+ {
+ nc = na+nb+1;
+ nc >>= 1;
+ }
+ else
+ {
+ if (na >= 0) // nB < 0
+ nc = na;
+ else if (nb >= 0) // nA < 0
+ nc = nb;
+ }
+
+ return (nc < 8) ? lookup_table[nc] : 3;
+}
+
+int h264cavlc_encode(PutBitContext *b, int16_t *coefficients, int len, int na, int nb, int is_chroma_dc)
+{
+ static const int8_t increment_vlcnum[6] = { 0, 3, 6, 12, 24, 48 };
+
+ int i, t;
+ int total_coeffs;
+ int trailing_ones;
+ int total_zeros;
+ int numlevels;
+ int16_t levels[256];
+ int16_t zeros[256];
+
+#ifdef DEBUG_H264CAVLC
+ for (i = 0 ; i < len ; i++)
+ av_log(NULL, AV_LOG_DEBUG, "%6d",coefficients[i]);
+ av_log(NULL, AV_LOG_DEBUG, "\n");
+#endif
+
+ // Count trailing ones, total non-zero coefficients and the number of non-trailing zeros
+
+ total_coeffs = 0;
+ trailing_ones = 0;
+ total_zeros = 0; // For now, we'll count the number of zeros at the end
+ for (i = 0 ; i < len ; i++)
+ {
+ int16_t val = coefficients[i];
+ if (val != 0)
+ {
+ levels[total_coeffs] = val;
+ zeros[total_coeffs] = total_zeros;
+ if (val == -1 || val == +1)
+ trailing_ones++;
+ else
+ trailing_ones = 0;
+ total_coeffs++;
+ total_zeros = 0;
+ }
+ else
+ total_zeros++;
+ }
+ if (trailing_ones > 3)
+ trailing_ones = 3;
+
+ total_zeros = len - total_zeros - total_coeffs; // The actual value of zeros (except the zeros at the end)
+ numlevels = total_coeffs - trailing_ones;
+
+ // Encode coeff_token. This is different for Chroma DC values
+
+ if (!is_chroma_dc)
+ {
+ int lookupTable = h264cavlc_get_lookup_table(na,nb);
+#ifdef DEBUG_H264CAVLC
+// av_log(NULL, AV_LOG_DEBUG, "Luma: vlc=%d #c=%d #t1=%d\n", lookupTable, total_coeffs, trailing_ones);
+#endif
+ h264cavlc_encode_vlc_coefftoken(b,lookupTable,total_coeffs,trailing_ones);
+ }
+ else
+ {
+#ifdef DEBUG_H264CAVLC
+// av_log(NULL, AV_LOG_DEBUG, "Chroma: #c=%d #t1=%d\n", total_coeffs, trailing_ones);
+#endif
+ h264cavlc_encode_vlc_coefftoken_chromadc(b,total_coeffs,trailing_ones);
+ }
+ if (total_coeffs == 0) // Only zeros here, nothing left to do
+ return 0;
+
+ // Encode the trailing one sign bits
+
+ for (i = total_coeffs-1, t = trailing_ones ; t > 0 ; i--, t--)
+ {
+ put_bits(b,1, levels[i] <= 0);
+ }
+
+ // Encode levels of the remaining nonzero coefficients
+
+ if (numlevels > 0)
+ {
+ int level_two_or_higher = 1;
+ int firstlevel = 1;
+ int vlcnum;
+
+ if (total_coeffs > 3 && trailing_ones == 3)
+ level_two_or_higher = 0;
+
+ vlcnum = total_coeffs > 10 && trailing_ones < 3;
+
+ for (i = numlevels-1 ; i >= 0 ; i--)
+ {
+ int16_t val = levels[i];
+ int16_t level = ABS(val);
+
+ if (level_two_or_higher)
+ {
+ val -= (val>>15)|1;
+ level_two_or_higher = 0;
+ }
+
+#ifdef DEBUG_H264CAVLC
+// av_log(NULL, AV_LOG_DEBUG, "Encoding level %d with vlc %d\n",val,vlcnum);
+#endif
+ h264cavlc_encode_vlc_level(b,vlcnum,val);
+
+ // update VLC table
+ if (vlcnum < 6 && level > increment_vlcnum[vlcnum])
+ vlcnum++;
+
+ if (firstlevel)
+ {
+ firstlevel = 0;
+ if (level > 3)
+ vlcnum = 2;
+ }
+ }
+ }
+
+ // If necessary, encode the amount of non-trailing zeros
+
+ if (total_coeffs < len)
+ {
+ int vlcnum = total_coeffs-1;
+
+#ifdef DEBUG_H264CAVLC
+// av_log(NULL, AV_LOG_DEBUG, "Encoding total_zeros %d with vlc %d\n",total_zeros,vlcnum);
+#endif
+
+ if (!is_chroma_dc)
+ h264cavlc_encode_vlc_totalzeros(b,vlcnum,total_zeros);
+ else
+ h264cavlc_encode_vlc_totalzeros_chromadc(b,vlcnum,total_zeros);
+ }
+
+ // If necessary, encode the run_before values
+
+ for (i = total_coeffs-1 ; i > 0 && total_zeros > 0 ; i--)
+ {
+ int runbefore = zeros[i];
+ int vlcnum = FFMIN(total_zeros-1, 6);
+
+#ifdef DEBUG_H264CAVLC
+// av_log(NULL, AV_LOG_DEBUG, "Encoding run %d with vlc %d\n",runbefore,vlcnum);
+#endif
+
+ h264cavlc_encode_vlc_run(b,vlcnum,runbefore);
+ total_zeros -= runbefore;
+ }
+
+ return total_coeffs;
+}
+
diff --git a/libavcodec/h264data.h b/libavcodec/h264data.h
index 1dd9daf..6ee2204 100644
--- a/libavcodec/h264data.h
+++ b/libavcodec/h264data.h
@@ -51,6 +51,24 @@ #define DC_128_PRED8x8 6
#define EXTENDED_SAR 255
+/* NAL unit types */
+enum {
+NAL_SLICE=1,
+NAL_DPA,
+NAL_DPB,
+NAL_DPC,
+NAL_IDR_SLICE,
+NAL_SEI,
+NAL_SPS,
+NAL_PPS,
+NAL_AUD,
+NAL_END_SEQUENCE,
+NAL_END_STREAM,
+NAL_FILLER_DATA,
+NAL_SPS_EXT,
+NAL_AUXILIARY_SLICE=19
+};
+
static const AVRational pixel_aspect[14]={
{0, 1},
{1, 1},
@@ -486,15 +504,6 @@ static const PMbInfo b_sub_mb_type_info[
{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
};
-
-static const uint8_t rem6[52]={
-0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-};
-
-static const uint8_t div6[52]={
-0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
-};
-
static const uint8_t default_scaling4[2][16]={
{ 6,13,20,28,
13,20,28,32,
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
new file mode 100644
index 0000000..ba2292f
--- /dev/null
+++ b/libavcodec/h264dsp.c
@@ -0,0 +1,260 @@
+/*
+ * H.264/MPEG-4 Part 10 (Base profile) encoder.
+ *
+ * DSP functions
+ *
+ * Copyright (c) 2006 Expertisecentrum Digitale Media, UHasselt
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file h264dsp.c
+ * H.264 encoder related DSP utils
+ *
+ */
+
+
+#include <stdio.h>
+#include "dsputil.h"
+
+extern const int16_t ff_h264_MF00[6];
+extern const int16_t ff_h264_V00[6];
+extern const uint8_t div6[52];
+extern const uint8_t rem6[52];
+
+#define COPY_SIGN(A, B) ((A ^ (B>>31)) - (B>>31))
+
+#define FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(X, Y) \
+ block[X][Y] = COPY_SIGN(((ABS((int32_t)outblock[X][Y])*MF[mod][X][Y]+f) >> qbits), outblock[X][Y])
+
+#define FF_H264_TRANSFORM_DCT_QUANT_C_LINE(X) \
+ FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(X,0); \
+ FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(X,1); \
+ FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(X,2); \
+ FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(X,3);
+
+// we'll always work with transposed input blocks, to avoid having to make a distinction between
+// C and mmx implementations
+void ff_h264_transform_dct_quant_c(int16_t block[4][4], int QP, int dontscaleDC) // y,x indexing
+{
+ static const int16_t MF[6][4][4] =
+ {
+ { { 13107, 8066, 13107, 8066}, { 8066, 5243, 8066, 5243}, { 13107, 8066, 13107, 8066}, { 8066, 5243, 8066, 5243} },
+ { { 11916, 7490, 11916, 7490}, { 7490, 4660, 7490, 4660}, { 11916, 7490, 11916, 7490}, { 7490, 4660, 7490, 4660} },
+ { { 10082, 6554, 10082, 6554}, { 6554, 4194, 6554, 4194}, { 10082, 6554, 10082, 6554}, { 6554, 4194, 6554, 4194} },
+ { { 9362, 5825, 9362, 5825}, { 5825, 3647, 5825, 3647}, { 9362, 5825, 9362, 5825}, { 5825, 3647, 5825, 3647} },
+ { { 8192, 5243, 8192, 5243}, { 5243, 3355, 5243, 3355}, { 8192, 5243, 8192, 5243}, { 5243, 3355, 5243, 3355} },
+ { { 7282, 4559, 7282, 4559}, { 4559, 2893, 4559, 2893}, { 7282, 4559, 7282, 4559}, { 4559, 2893, 4559, 2893} }
+ };
+ int32_t qbits = 15 + div6[QP];
+ int32_t f = (1<<qbits)/3;
+ int mod = rem6[QP];
+ DCTELEM outblock[4][4];
+
+ ff_h264_dct_c(block, outblock);
+
+ if (dontscaleDC)
+ block[0][0] = outblock[0][0];
+ else
+ FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(0,0);
+ FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(0,1);
+ FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(0,2);
+ FF_H264_TRANSFORM_DCT_QUANT_C_ELEMENT(0,3);
+
+ FF_H264_TRANSFORM_DCT_QUANT_C_LINE(1);
+ FF_H264_TRANSFORM_DCT_QUANT_C_LINE(2);
+ FF_H264_TRANSFORM_DCT_QUANT_C_LINE(3);
+}
+
+#define H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(X, Y) \
+ elem[X][Y] = ((int32_t)block[X][Y]*V[mod][X][Y]) << shift;
+
+#define H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_LINE(X) \
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(X, 0) \
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(X, 1) \
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(X, 2) \
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(X, 3)
+
+#define H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(X, Y) \
+ elem[X][Y] = ((int32_t)block[X][Y]*V[mod][X][Y]+add) >> shift;
+
+#define H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_LINE2(X) \
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(X, 0) \
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(X, 1) \
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(X, 2) \
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(X, 3)
+
+void ff_h264_transform_inverse_quant_dct_add_c(int16_t block[4][4], int QP, int dontscaleDC, uint8_t *dst, int stride) // y,x indexing
+{
+ static const int16_t V[6][4][4] =
+ {
+ { { 10*16, 13*16, 10*16, 13*16}, { 13*16, 16*16, 13*16, 16*16}, { 10*16, 13*16, 10*16, 13*16}, { 13*16, 16*16, 13*16, 16*16} },
+ { { 11*16, 14*16, 11*16, 14*16}, { 14*16, 18*16, 14*16, 18*16}, { 11*16, 14*16, 11*16, 14*16}, { 14*16, 18*16, 14*16, 18*16} },
+ { { 13*16, 16*16, 13*16, 16*16}, { 16*16, 20*16, 16*16, 20*16}, { 13*16, 16*16, 13*16, 16*16}, { 16*16, 20*16, 16*16, 20*16} },
+ { { 14*16, 18*16, 14*16, 18*16}, { 18*16, 23*16, 18*16, 23*16}, { 14*16, 18*16, 14*16, 18*16}, { 18*16, 23*16, 18*16, 23*16} },
+ { { 16*16, 20*16, 16*16, 20*16}, { 20*16, 25*16, 20*16, 25*16}, { 16*16, 20*16, 16*16, 20*16}, { 20*16, 25*16, 20*16, 25*16} },
+ { { 18*16, 23*16, 18*16, 23*16}, { 23*16, 29*16, 23*16, 29*16}, { 18*16, 23*16, 18*16, 23*16}, { 23*16, 29*16, 23*16, 29*16} }
+ };
+ DCTELEM elem[4][4];
+ int mod = rem6[QP];
+
+ if (QP >= 24)
+ {
+ int shift = div6[QP]-4;
+
+ if (dontscaleDC)
+ elem[0][0] = block[0][0];
+ else
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(0, 0);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(0, 1);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(0, 2);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT(0, 3);
+
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_LINE(1);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_LINE(2);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_LINE(3);
+ }
+ else
+ {
+ int add = (1<<(3-div6[QP]));
+ int shift = (4-div6[QP]);
+ if (dontscaleDC)
+ elem[0][0] = block[0][0];
+ else
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(0, 0);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(0, 1);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(0, 2);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_ELEMENT2(0, 3);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_LINE2(1);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_LINE2(2);
+ H264_TRANSFORM_INVERSE_QUANT_DCT_ADD_C_LINE2(3);
+ if (dontscaleDC)
+ elem[0][0] = block[0][0];
+ }
+
+ ff_h264_idct_add_c(dst,&(elem[0][0]),stride);
+}
+
+#define FF_H264_HADAMARD_QUANT_4X4_C_ELEMENT(A, B) \
+ Y[A][B] = COPY_SIGN((((ABS(Y[A][B])>>1) * MF + f2) >> shift), Y[A][B]);
+
+#define FF_H264_HADAMARD_QUANT_4X4_C_LINE(X) \
+ FF_H264_HADAMARD_QUANT_4X4_C_ELEMENT(X, 0); \
+ FF_H264_HADAMARD_QUANT_4X4_C_ELEMENT(X, 1); \
+ FF_H264_HADAMARD_QUANT_4X4_C_ELEMENT(X, 2); \
+ FF_H264_HADAMARD_QUANT_4X4_C_ELEMENT(X, 3);
+
+/**
+ * |ZD(i,j)| = (|YD(i,j)| MF(0,0) + 2 f) >> (qbits + 1)
+ *
+ */
+void ff_h264_hadamard_quant_4x4_c(DCTELEM Y[4][4], int QP)
+{
+ int qbits = 15 + div6[QP];
+ int f2 = ((1 << qbits) / 3) * 2; /* 2*f, with f = (1 << qbits)/3 */
+ int shift = (qbits + 1);
+ int mod = rem6[QP];
+
+ int32_t MF = ff_h264_MF00[mod];
+
+ FF_H264_HADAMARD_QUANT_4X4_C_LINE(0);
+ FF_H264_HADAMARD_QUANT_4X4_C_LINE(1);
+ FF_H264_HADAMARD_QUANT_4X4_C_LINE(2);
+ FF_H264_HADAMARD_QUANT_4X4_C_LINE(3);
+}
+
+#define H264_HADAMARD_INVQUANT_4X4_C_LOWQP_ELEMENT(A, B) \
+ Y[A][B] = (Y[A][B]*V + f) >> shift;
+
+#define H264_HADAMARD_INVQUANT_4X4_C_LOWQP_LINE(A) \
+ H264_HADAMARD_INVQUANT_4X4_C_LOWQP_ELEMENT(A, 0) \
+ H264_HADAMARD_INVQUANT_4X4_C_LOWQP_ELEMENT(A, 1) \
+ H264_HADAMARD_INVQUANT_4X4_C_LOWQP_ELEMENT(A, 2) \
+ H264_HADAMARD_INVQUANT_4X4_C_LOWQP_ELEMENT(A, 3)
+
+#define H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_ELEMENT(A,B) \
+ Y[A][B] = (Y[A][B]*V) << shift ;
+
+#define H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_LINE(A) \
+ H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_ELEMENT(A, 0) \
+ H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_ELEMENT(A, 1) \
+ H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_ELEMENT(A, 2) \
+ H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_ELEMENT(A, 3)
+
+/*
+ * Only if qpprime_y_zero_transform_bypass_flag == 0
+ */
+void ff_h264_hadamard_invquant_4x4_c(DCTELEM Y[4][4], int QP)
+{
+ int mod = rem6[QP];
+
+ if (QP < 36)
+ {
+ int qbits = div6[QP];
+ int shift = 6-qbits;
+ int f = (1 << (5-qbits));
+
+ int32_t V = ff_h264_V00[mod];
+
+ H264_HADAMARD_INVQUANT_4X4_C_LOWQP_LINE(0);
+ H264_HADAMARD_INVQUANT_4X4_C_LOWQP_LINE(1);
+ H264_HADAMARD_INVQUANT_4X4_C_LOWQP_LINE(2);
+ H264_HADAMARD_INVQUANT_4X4_C_LOWQP_LINE(3);
+ }
+ else
+ {
+ int shift = div6[QP] - 6;
+ int32_t V = ff_h264_V00[mod];
+
+ H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_LINE(0);
+ H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_LINE(1);
+ H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_LINE(2);
+ H264_HADAMARD_INVQUANT_4X4_C_HIGHQP_LINE(3);
+ }
+}
+
+#define FF_H264_HADAMARD_QUANT_2X2_C_ELEMENT(A, B) \
+ Y[A][B] = COPY_SIGN(((ABS(Y[A][B])*MF + f2) >> shift),Y[A][B])
+
+/**
+ * |ZD(i,j)| = (|YD(i,j)| MF(0,0) + 2 f) >> (qbits + 1)
+ *
+ */
+void ff_h264_hadamard_quant_2x2_c(int16_t Y[2][2], int QP)
+{
+ int qbits = 15 + div6[QP];
+ int f2 = ((1 << qbits) / 3)*2;
+ int shift = qbits+1;
+ int32_t MF = ff_h264_MF00[rem6[QP]];
+
+ FF_H264_HADAMARD_QUANT_2X2_C_ELEMENT(0, 0);
+ FF_H264_HADAMARD_QUANT_2X2_C_ELEMENT(0, 1);
+ FF_H264_HADAMARD_QUANT_2X2_C_ELEMENT(1, 0);
+ FF_H264_HADAMARD_QUANT_2X2_C_ELEMENT(1, 1);
+}
+
+void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx)
+{
+ c->h264_dct = ff_h264_dct_c;
+ c->h264_idct_notranspose_add = ff_h264_idct_add_c;
+ c->h264_hadamard_mult4x4 = ff_h264_hadamard_mult4x4_c;
+ c->h264_hadamard_quant_2x2 = ff_h264_hadamard_quant_2x2_c;
+ c->h264_hadamard_quant_4x4 = ff_h264_hadamard_quant_4x4_c;
+ c->h264_hadamard_invquant_4x4 = ff_h264_hadamard_invquant_4x4_c;
+ c->h264_transform_dct_quant = ff_h264_transform_dct_quant_c;
+ c->h264_transform_inverse_quant_dct_add = ff_h264_transform_inverse_quant_dct_add_c;
+}
+
diff --git a/libavcodec/h264enc.c b/libavcodec/h264enc.c
new file mode 100644
index 0000000..91b7eaa
--- /dev/null
+++ b/libavcodec/h264enc.c
@@ -0,0 +1,2501 @@
+/*
+ * H.264 encoder
+ * Copyright (c) 2006 Expertisecentrum Digitale Media, UHasselt
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "common.h"
+#include "avcodec.h"
+#include "bitstream.h"
+#include "golomb.h"
+#include "mpegvideo.h"
+#include "h264data.h"
+#include "dsputil.h"
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "h264enc.h"
+
+
+#define DEFAULT_QP 30
+#define NUMBER_OF_FRAMES 2
+#define RATECONTROLINTERVAL 0.5
+#define CHROMA_QP_INDEX_OFFSET_MAX 12
+#define CHROMA_QP_INDEX_OFFSET_MIN -12
+
+#define H264_COPY_4X4BLOCK_TRANSPOSED_PART(A, xoffset, yoffset, dest, src1, src2) \
+ dest[0][A] = src1[yoffset+A][xoffset+0]-src2[yoffset+A][xoffset+0]; \
+ dest[1][A] = src1[yoffset+A][xoffset+1]-src2[yoffset+A][xoffset+1]; \
+ dest[2][A] = src1[yoffset+A][xoffset+2]-src2[yoffset+A][xoffset+2]; \
+ dest[3][A] = src1[yoffset+A][xoffset+3]-src2[yoffset+A][xoffset+3];
+
+#define H264_COPY_4X4BLOCK_TRANSPOSED(xoffset,yoffset,dest,src1,src2) \
+{ \
+ H264_COPY_4X4BLOCK_TRANSPOSED_PART(0, xoffset, yoffset, dest, src1, src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED_PART(1, xoffset, yoffset, dest, src1, src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED_PART(2, xoffset, yoffset, dest, src1, src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED_PART(3, xoffset, yoffset, dest, src1, src2); \
+}
+
+#define H264_COPY_16X16BLOCK(dest,src1,src2) \
+{ \
+ H264_COPY_4X4BLOCK_TRANSPOSED(0,0,dest[0][0],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(4,0,dest[0][1],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(8,0,dest[0][2],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(12,0,dest[0][3],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(0,4,dest[1][0],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(4,4,dest[1][1],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(8,4,dest[1][2],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(12,4,dest[1][3],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(0,8,dest[2][0],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(4,8,dest[2][1],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(8,8,dest[2][2],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(12,8,dest[2][3],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(0,12,dest[3][0],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(4,12,dest[3][1],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(8,12,dest[3][2],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(12,12,dest[3][3],src1,src2); \
+}
+
+#define H264_COPY_8X8BLOCK(dest,src1,src2) \
+{ \
+ H264_COPY_4X4BLOCK_TRANSPOSED(0,0,dest[0][0],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(4,0,dest[0][1],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(0,4,dest[1][0],src1,src2); \
+ H264_COPY_4X4BLOCK_TRANSPOSED(4,4,dest[1][1],src1,src2); \
+}
+
+int h264cavlc_encode(PutBitContext *b, int16_t *coefficients, int len, int nA, int nB, int isChromaDC);
+void h264cavlc_generate_tables();
+void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
+
+void pred16x16_vertical_c(uint8_t *src, int stride);
+void pred8x8_vertical_c(uint8_t *src, int stride);
+void pred16x16_horizontal_c(uint8_t *src, int stride);
+void pred8x8_horizontal_c(uint8_t *src, int stride);
+void pred16x16_plane_c(uint8_t *src, int stride);
+void pred8x8_plane_c(uint8_t *src, int stride);
+void pred16x16_128_dc_c(uint8_t *src, int stride);
+void pred8x8_128_dc_c(uint8_t *src, int stride);
+void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size);
+
+static int8_t mbtype_map[4][3][2];
+
+/**
+ * For a specific picture, this function sets the correct Y,U and V start addresses for each macroblock
+ */
+static void ff_h264_assign_macroblocks(AVPicture *p, MacroBlock **mb_map, int mb_width, int mb_height, int setneighbours)
+{
+ int y,x,i;
+ int Ylinesize = p->linesize[0];
+ int Ulinesize = p->linesize[1];
+ int Vlinesize = p->linesize[2];
+
+ if (!setneighbours)
+ {
+ for (y = 0 ; y < mb_height ; y++)
+ {
+ int y16 = y << 4;
+ int y8 = y << 3;
+
+ for (x = 0 ; x < mb_width ; x++)
+ {
+ int x16 = x << 4;
+ int x8 = x << 3;
+
+ for (i = 0 ; i < 8 ; i++)
+ {
+ int ypos = y8+i;
+ mb_map[y][x].U[i] = p->data[1]+(x8+ypos*Ulinesize);
+ mb_map[y][x].V[i] = p->data[2]+(x8+ypos*Vlinesize);
+ }
+ for (i = 0 ; i < 16 ; i++)
+ mb_map[y][x].Y[i] = p->data[0]+(x16+(y16+i)*Ylinesize);
+
+ mb_map[y][x].topblock = NULL;
+ mb_map[y][x].leftblock = NULL;
+ mb_map[y][x].rightblock = NULL;
+ mb_map[y][x].available = 0;
+ }
+ }
+ }
+ else
+ {
+ y = 0;
+ x = 0;
+ for (i = 0 ; i < 8 ; i++)
+ {
+ mb_map[y][x].U[i] = p->data[1]+((x<<3)+((y<<3)+i)*Ulinesize);
+ mb_map[y][x].V[i] = p->data[2]+((x<<3)+((y<<3)+i)*Vlinesize);
+ }
+ for (i = 0 ; i < 16 ; i++)
+ mb_map[y][x].Y[i] = p->data[0]+((x<<4)+((y<<4)+i)*Ylinesize);
+
+ mb_map[y][x].topblock = NULL;
+ mb_map[y][x].leftblock = NULL;
+
+ if (x < mb_width-1)
+ mb_map[y][x].rightblock = &(mb_map[y][x+1]);
+ else
+ mb_map[y][x].rightblock = NULL;
+ mb_map[y][x].available = 0;
+
+ y = 0;
+ for (x = 1 ; x < mb_width ; x++)
+ {
+ for (i = 0 ; i < 8 ; i++)
+ {
+ mb_map[y][x].U[i] = p->data[1]+((x<<3)+((y<<3)+i)*Ulinesize);
+ mb_map[y][x].V[i] = p->data[2]+((x<<3)+((y<<3)+i)*Vlinesize);
+ }
+ for (i = 0 ; i < 16 ; i++)
+ mb_map[y][x].Y[i] = p->data[0]+((x<<4)+((y<<4)+i)*Ylinesize);
+
+ mb_map[y][x].topblock = NULL;
+ mb_map[y][x].leftblock = &(mb_map[y][x-1]);
+ if (x < mb_width-1)
+ mb_map[y][x].rightblock = &(mb_map[y][x+1]);
+ else
+ mb_map[y][x].rightblock = NULL;
+ mb_map[y][x].available = 0;
+ }
+
+ x = 0;
+ for (y = 1 ; y < mb_height ; y++)
+ {
+ for (i = 0 ; i < 8 ; i++)
+ {
+ mb_map[y][x].U[i] = p->data[1]+((x<<3)+((y<<3)+i)*Ulinesize);
+ mb_map[y][x].V[i] = p->data[2]+((x<<3)+((y<<3)+i)*Vlinesize);
+ }
+ for (i = 0 ; i < 16 ; i++)
+ mb_map[y][x].Y[i] = p->data[0]+((x<<4)+((y<<4)+i)*Ylinesize);
+
+ mb_map[y][x].topblock = &(mb_map[y-1][x]);
+ mb_map[y][x].leftblock = NULL;
+ if (x < mb_width-1)
+ mb_map[y][x].rightblock = &(mb_map[y][x+1]);
+ else
+ mb_map[y][x].rightblock = NULL;
+ mb_map[y][x].available = 0;
+ }
+
+ for (y = 1 ; y < mb_height ; y++)
+ {
+ for (x = 1 ; x < mb_width ; x++)
+ {
+ for (i = 0 ; i < 8 ; i++)
+ {
+ mb_map[y][x].U[i] = p->data[1]+((x<<3)+((y<<3)+i)*Ulinesize);
+ mb_map[y][x].V[i] = p->data[2]+((x<<3)+((y<<3)+i)*Vlinesize);
+ }
+ for (i = 0 ; i < 16 ; i++)
+ mb_map[y][x].Y[i] = p->data[0]+((x<<4)+((y<<4)+i)*Ylinesize);
+
+ mb_map[y][x].topblock = &(mb_map[y-1][x]);
+ mb_map[y][x].leftblock = &(mb_map[y][x-1]);
+ if (x < mb_width-1)
+ mb_map[y][x].rightblock = &(mb_map[y][x+1]);
+ else
+ mb_map[y][x].rightblock = NULL;
+ mb_map[y][x].available = 0;
+ }
+ }
+ }
+}
+
+static void ff_h264_clear_nonzero_markers(MacroBlock **mb_map, int mb_width, int mb_height)
+{
+ int x,y;
+
+ for (y = 0 ; y < mb_height ; y++)
+ {
+ for (x = 0 ; x < mb_width ; x++)
+ {
+ // mark as not available
+
+ memset(&(mb_map[y][x].Y_nonzero[0][0]),0xff,sizeof(int)*16); // set to -1
+ memset(&(mb_map[y][x].U_nonzero[0][0]),0xff,sizeof(int)*4); // set to -1
+ memset(&(mb_map[y][x].V_nonzero[0][0]),0xff,sizeof(int)*4); // set to -1
+
+ mb_map[y][x].available = 0;
+ }
+ }
+}
+
+static void ff_h264_init_tables()
+{
+ int a, b, c;
+ for(a=0; a<4; a++)
+ for(b=0; b<3; b++)
+ for(c=0; c<2; c++)
+ mbtype_map[a][b][c] = 1 + a + 4*(b + 3*c);
+}
+
+static int ff_h264_encoder_init(AVCodecContext *avctx)
+{
+ H264Context *t = (H264Context *)avctx->priv_data;
+ uint8_t *buf;
+ int s, x, y, i, res;
+ int width, height;
+
+ switch(avctx->pix_fmt){
+ case PIX_FMT_YUV420P:
+ break;
+ default:
+ av_log(avctx, AV_LOG_ERROR, "format not supported\n");
+ return -1;
+ }
+
+ t->frame_cropping_flag = 0;
+ t->frame_crop_left_offset = 0;
+ t->frame_crop_right_offset = 0;
+ t->frame_crop_top_offset = 0;
+ t->frame_crop_bottom_offset = 0;
+
+ width = avctx->width;
+ height = avctx->height;
+
+ t->mb_width = width/16;
+ t->mb_height = height/16;
+ t->frame_width = width;
+ t->frame_height = height;
+
+ /* If the width is not a multiple of 16, enable cropping */
+ if (( width % 16) !=0 )
+ {
+ t->frame_cropping_flag = 1;
+ t->frame_crop_left_offset = 0;
+ t->frame_crop_right_offset = (width%16)/2;
+ t->mb_width++;
+ }
+
+ /* If the height is not a multiple of 16, enable cropping */
+ if (( height % 16) !=0 )
+ {
+ t->frame_cropping_flag = 1;
+ t->frame_crop_top_offset = 0;
+ t->frame_crop_bottom_offset = (height%16)/2;
+ t->mb_height++;
+ }
+
+ /* Round the framesize upwards to a multiple of 16 */
+ width = t->mb_width * 16;
+ height = t->mb_height * 16;
+ t->refframe_width = width;
+ t->refframe_height = height;
+
+ s = avpicture_get_size(avctx->pix_fmt, width, height);
+ res = avpicture_alloc(&t->pi, avctx->pix_fmt, width, height);
+ if (res) {
+ av_log(avctx, AV_LOG_ERROR, "Problem allocating picture\n");
+ return -1;
+ }
+ res = avpicture_alloc(&t->po, avctx->pix_fmt, width, height);
+ if (res) {
+ av_log(avctx, AV_LOG_ERROR, "Problem allocating picture\n");
+ return -1;
+ }
+
+ t->pi_data0 = (uint8_t *)t->pi.data[0];
+ t->po_data0 = (uint8_t *)t->po.data[0];
+ t->bufsize = s*2;
+ t->frame_num = 0;
+
+ t->mb_map = (MacroBlock **)av_malloc(sizeof(MacroBlock*) * t->mb_height);
+ for (y = 0 ; y < t->mb_height ; y++)
+ {
+ t->mb_map[y] = (MacroBlock *)av_malloc(sizeof(MacroBlock) * t->mb_width);
+ for (x = 0 ; x < t->mb_width ; x++)
+ {
+ t->mb_map[y][x].Y_width = 16;
+ t->mb_map[y][x].Y_height = 16;
+ }
+ }
+
+ t->framebufsize = NUMBER_OF_FRAMES;
+ t->reconstructed_frames = (FrameInfo **)av_malloc(sizeof(FrameInfo *)*t->framebufsize);
+
+ for (i = 0 ; i < t->framebufsize ; i++)
+ {
+ t->reconstructed_frames[i] = (FrameInfo *)av_malloc(sizeof(FrameInfo));
+
+ buf = av_malloc(s);
+ avpicture_fill(&(t->reconstructed_frames[i]->reconstructed_picture), buf, PIX_FMT_YUV420P, width, height);
+
+ t->reconstructed_frames[i]->reconstructed_mb_map = (MacroBlock **)av_malloc(sizeof(MacroBlock*) * t->mb_height);
+ for (y = 0 ; y < t->mb_height ; y++)
+ {
+ t->reconstructed_frames[i]->reconstructed_mb_map[y] = (MacroBlock *)av_malloc(sizeof(MacroBlock) * t->mb_width);
+ for (x = 0 ; x < t->mb_width ; x++)
+ {
+ t->reconstructed_frames[i]->reconstructed_mb_map[y][x].Y_width = 16;
+ t->reconstructed_frames[i]->reconstructed_mb_map[y][x].Y_height = 16;
+ }
+ }
+ ff_h264_assign_macroblocks(&(t->reconstructed_frames[i]->reconstructed_picture),t->reconstructed_frames[i]->reconstructed_mb_map,t->mb_width,t->mb_height,1);
+ }
+
+ if (!avctx->global_quality)
+ {
+ t->QP = DEFAULT_QP;
+ t->use_fixed_qp = 0;
+ }
+ else
+ {
+ t->QP = avctx->global_quality / FF_QP2LAMBDA;
+ t->use_fixed_qp = 1;
+ }
+ t->PPS_QP = t->QP;
+
+ t->chroma_qp_index_offset = avctx->chromaoffset;
+ t->chroma_qp_index_offset = clip(t->chroma_qp_index_offset, CHROMA_QP_INDEX_OFFSET_MIN, CHROMA_QP_INDEX_OFFSET_MAX);
+ t->IDRcount = 64;
+ t->IDR_frame_num = 0;
+
+ // init dsp
+ dsputil_init(&(t->dspcontext),avctx);
+ t->Y_stride = t->reconstructed_frames[0]->reconstructed_picture.linesize[0];
+ t->U_stride = t->reconstructed_frames[0]->reconstructed_picture.linesize[1];
+ t->V_stride = t->reconstructed_frames[0]->reconstructed_picture.linesize[2];
+
+ // Create an AVPicture instance with the same dimensions as the reference pictures to hold a copy
+ // of the input frame
+ buf = (uint8_t *)av_malloc(s);
+ avpicture_fill(&(t->input_frame_copy), buf, PIX_FMT_YUV420P, width, height);
+ memset(buf,0,s);
+
+ // Assign the macroblock map to this copy of the input image
+ ff_h264_assign_macroblocks(&(t->input_frame_copy),t->mb_map,t->mb_width,t->mb_height,0);
+
+ // Blocksize history, we use a separate history for I and P frame
+ t->milliseconds_per_frame = (1000*avctx->time_base.num)/avctx->time_base.den;
+ t->blocksize_history_length = (RATECONTROLINTERVAL*avctx->time_base.den)/avctx->time_base.num;
+ t->blocksize_history = (int64_t *)av_malloc(sizeof(int64_t)*t->blocksize_history_length);
+ t->blocksize_history_pos = 0;
+ t->blocksize_history_num_filled = 0;
+ t->blocksize_history_total_milliseconds = 0;
+ t->blocksize_history_sum = 0;
+ for (i = 0 ; i < t->blocksize_history_length ; i++)
+ t->blocksize_history[i] = 0;
+
+ h264cavlc_generate_tables();
+ ff_h264_init_tables();
+ return 0;
+}
+
+/**
+ * @param nal_ref_idc the nal_ref_idc field for the NAL unit header
+ * @param nal_unit_type the nal_unit_type field for the NAL unit header
+ * @param dest the target buffer
+ * @param destsize the size of the dest buffer, decremented by the number of bytes written
+ * @param b2 bit writer containing the RBSP data which should be escaped
+ * @returns pointer to the current position in the output buffer or NULL if an error occurred
+ */
+static uint8_t *ff_h264_write_nal_unit(int nal_ref_idc, int nal_unit_type, uint8_t *dest, int *destsize,
+ PutBitContext *b2)
+{
+ PutBitContext b;
+ int i, destpos, rbsplen, escape_count;
+ uint8_t *rbsp, temp;
+
+ // Align b2 on a byte boundary
+
+ align_put_bits(b2);
+ rbsplen = put_bits_count(b2)/8;
+ flush_put_bits(b2);
+ rbsp = b2->buf;
+
+ init_put_bits(&b,dest,*destsize);
+
+ put_bits(&b,16,0);
+ put_bits(&b,16,0x01);
+
+ put_bits(&b,1,0); // forbidden zero bit
+ put_bits(&b,2,nal_ref_idc); // nal_ref_idc
+ put_bits(&b,5,nal_unit_type); // nal_unit_type
+
+ flush_put_bits(&b);
+
+ destpos = 5;
+ escape_count= 0;
+ for(i=0; i<rbsplen; i+=2){
+ if(rbsp[i]) continue;
+ if(i>0 && rbsp[i-1]==0)
+ i--;
+ if(i+2<rbsplen && rbsp[i+1]==0 && rbsp[i+2]<=3){
+ escape_count++;
+ i+=2;
+ }
+ }
+
+ if(escape_count==0){
+ if(dest+destpos != rbsp) {
+ memcpy(dest+destpos, rbsp, rbsplen);
+ *destsize -= (rbsplen+destpos);
+ }
+ return dest+rbsplen+destpos;
+ }
+ if(rbsplen + escape_count + 1> *destsize)
+ {
+ av_log(NULL, AV_LOG_ERROR, "Destination buffer too small!\n");
+ return NULL;
+ }
+ //this should be damn rare (hopefully)
+ for (i = 0 ; i < rbsplen ; i++)
+ {
+ if (i + 2 < rbsplen && (rbsp[i] == 0 && rbsp[i+1] == 0 && rbsp[i+2] < 4))
+ {
+ dest[destpos++] = rbsp[i++];
+ dest[destpos++] = rbsp[i];
+ dest[destpos++] = 0x03; // emulation prevention byte
+ }
+ else
+ dest[destpos++] = rbsp[i];
+ }
+ *destsize -= destpos;
+ return dest+destpos;
+}
+
+static void ff_h264_encode_I_PCM(MacroBlock *mb, PutBitContext *b, MacroBlock *copy_mb)
+{
+ int w = mb->Y_width;
+ int h = mb->Y_height;
+ int x,y;
+
+ set_ue_golomb(b, 25); // mb_type = I_PCM
+ align_put_bits(b);
+
+ // Y
+
+ for (y = 0 ; y < h ; y++)
+ {
+ for (x = 0 ; x < w ; x++)
+ put_bits(b,8,mb->Y[y][x]);
+ for ( ; x < 16 ; x++)
+ put_bits(b,8,0);
+ }
+ for ( ; y < 16 ; y++)
+ {
+ for (x = 0 ; x < 16 ; x++)
+ put_bits(b,8,0);
+ }
+
+ // copy Y
+
+ for (y = 0 ; y < h ; y++)
+ for (x = 0 ; x < w ; x++)
+ copy_mb->Y[y][x] = mb->Y[y][x];
+
+ w >>= 1;
+ h >>= 1;
+
+ // U
+
+ for (y = 0 ; y < h ; y++)
+ {
+ for (x = 0 ; x < w ; x++)
+ put_bits(b,8,mb->U[y][x]);
+ for ( ; x < 8 ; x++)
+ put_bits(b,8,0);
+ }
+ for ( ; y < 8 ; y++)
+ {
+ for (x = 0 ; x < 8 ; x++)
+ put_bits(b,8,0);
+ }
+
+ // V
+
+ for (y = 0 ; y < h ; y++)
+ {
+ for (x = 0 ; x < w ; x++)
+ put_bits(b,8,mb->V[y][x]);
+ for ( ; x < 8 ; x++)
+ put_bits(b,8,0);
+ }
+ for ( ; y < 8 ; y++)
+ {
+ for (x = 0 ; x < 8 ; x++)
+ put_bits(b,8,0);
+ }
+
+ // copy U and V
+
+ for (y = 0 ; y < h ; y++)
+ {
+ for (x = 0 ; x < w ; x++)
+ {
+ copy_mb->U[y][x] = mb->U[y][x];
+ copy_mb->V[y][x] = mb->V[y][x];
+ }
+ }
+
+ // store the nonzero counts (set to 16 for I_PCM blocks)
+ fill_rectangle(copy_mb->Y_nonzero, 4, 4, 4, 16, sizeof(int));
+ fill_rectangle(copy_mb->U_nonzero, 2, 2, 2, 16, sizeof(int));
+ fill_rectangle(copy_mb->V_nonzero, 2, 2, 2, 16, sizeof(int));
+
+ copy_mb->available = 1;
+}
+
+// inblock is transposed, outblock isn't
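+// (4x4 integer core transform C * X * C^T, with C = {{1,1,1,1},{2,1,-1,-2},{1,-1,-1,1},{1,-2,2,-1}})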
+void ff_h264_dct_c(DCTELEM inblock[4][4],DCTELEM outblock[4][4])
+{
+ DCTELEM pieces[4][4];
+
+ pieces[0][0] = inblock[0][0]+inblock[1][0]+inblock[2][0]+inblock[3][0];
+ pieces[0][1] = inblock[0][1]+inblock[1][1]+inblock[2][1]+inblock[3][1];
+ pieces[0][2] = inblock[0][2]+inblock[1][2]+inblock[2][2]+inblock[3][2];
+ pieces[0][3] = inblock[0][3]+inblock[1][3]+inblock[2][3]+inblock[3][3];
+
+ pieces[1][0] = (inblock[0][0]<<1)+inblock[1][0]-inblock[2][0]-(inblock[3][0]<<1);
+ pieces[1][1] = (inblock[0][1]<<1)+inblock[1][1]-inblock[2][1]-(inblock[3][1]<<1);
+ pieces[1][2] = (inblock[0][2]<<1)+inblock[1][2]-inblock[2][2]-(inblock[3][2]<<1);
+ pieces[1][3] = (inblock[0][3]<<1)+inblock[1][3]-inblock[2][3]-(inblock[3][3]<<1);
+
+ pieces[2][0] = inblock[0][0]-inblock[1][0]-inblock[2][0]+inblock[3][0];
+ pieces[2][1] = inblock[0][1]-inblock[1][1]-inblock[2][1]+inblock[3][1];
+ pieces[2][2] = inblock[0][2]-inblock[1][2]-inblock[2][2]+inblock[3][2];
+ pieces[2][3] = inblock[0][3]-inblock[1][3]-inblock[2][3]+inblock[3][3];
+
+ pieces[3][0] = inblock[0][0]-(inblock[1][0]<<1)+(inblock[2][0]<<1)-inblock[3][0];
+ pieces[3][1] = inblock[0][1]-(inblock[1][1]<<1)+(inblock[2][1]<<1)-inblock[3][1];
+ pieces[3][2] = inblock[0][2]-(inblock[1][2]<<1)+(inblock[2][2]<<1)-inblock[3][2];
+ pieces[3][3] = inblock[0][3]-(inblock[1][3]<<1)+(inblock[2][3]<<1)-inblock[3][3];
+
+ outblock[0][0] = pieces[0][0]+pieces[0][1]+pieces[0][2]+pieces[0][3];
+ outblock[0][1] = pieces[1][0]+pieces[1][1]+pieces[1][2]+pieces[1][3];
+ outblock[0][2] = pieces[2][0]+pieces[2][1]+pieces[2][2]+pieces[2][3];
+ outblock[0][3] = pieces[3][0]+pieces[3][1]+pieces[3][2]+pieces[3][3];
+
+ outblock[1][0] = (pieces[0][0] << 1)+pieces[0][1]-pieces[0][2]-(pieces[0][3]<<1);
+ outblock[1][1] = (pieces[1][0] << 1)+pieces[1][1]-pieces[1][2]-(pieces[1][3]<<1);
+ outblock[1][2] = (pieces[2][0] << 1)+pieces[2][1]-pieces[2][2]-(pieces[2][3]<<1);
+ outblock[1][3] = (pieces[3][0] << 1)+pieces[3][1]-pieces[3][2]-(pieces[3][3]<<1);
+
+ outblock[2][0] = pieces[0][0]-pieces[0][1]-pieces[0][2]+pieces[0][3];
+ outblock[2][1] = pieces[1][0]-pieces[1][1]-pieces[1][2]+pieces[1][3];
+ outblock[2][2] = pieces[2][0]-pieces[2][1]-pieces[2][2]+pieces[2][3];
+ outblock[2][3] = pieces[3][0]-pieces[3][1]-pieces[3][2]+pieces[3][3];
+
+ outblock[3][0] = pieces[0][0]-(pieces[0][1]<<1)+(pieces[0][2]<<1)-pieces[0][3];
+ outblock[3][1] = pieces[1][0]-(pieces[1][1]<<1)+(pieces[1][2]<<1)-pieces[1][3];
+ outblock[3][2] = pieces[2][0]-(pieces[2][1]<<1)+(pieces[2][2]<<1)-pieces[2][3];
+ outblock[3][3] = pieces[3][0]-(pieces[3][1]<<1)+(pieces[3][2]<<1)-pieces[3][3];
+}
+
+
+void ff_h264_hadamard_mult4x4_c(DCTELEM Y[4][4])
+{
+ DCTELEM pieces[4][4];
+
+ pieces[0][0] = Y[0][0]+Y[0][1]+Y[0][2]+Y[0][3];
+ pieces[0][1] = Y[1][0]+Y[1][1]+Y[1][2]+Y[1][3];
+ pieces[0][2] = Y[2][0]+Y[2][1]+Y[2][2]+Y[2][3];
+ pieces[0][3] = Y[3][0]+Y[3][1]+Y[3][2]+Y[3][3];
+
+ pieces[1][0] = Y[0][0]+Y[0][1]-Y[0][2]-Y[0][3];
+ pieces[1][1] = Y[1][0]+Y[1][1]-Y[1][2]-Y[1][3];
+ pieces[1][2] = Y[2][0]+Y[2][1]-Y[2][2]-Y[2][3];
+ pieces[1][3] = Y[3][0]+Y[3][1]-Y[3][2]-Y[3][3];
+
+ pieces[2][0] = Y[0][0]-Y[0][1]-Y[0][2]+Y[0][3];
+ pieces[2][1] = Y[1][0]-Y[1][1]-Y[1][2]+Y[1][3];
+ pieces[2][2] = Y[2][0]-Y[2][1]-Y[2][2]+Y[2][3];
+ pieces[2][3] = Y[3][0]-Y[3][1]-Y[3][2]+Y[3][3];
+
+ pieces[3][0] = Y[0][0]-Y[0][1]+Y[0][2]-Y[0][3];
+ pieces[3][1] = Y[1][0]-Y[1][1]+Y[1][2]-Y[1][3];
+ pieces[3][2] = Y[2][0]-Y[2][1]+Y[2][2]-Y[2][3];
+ pieces[3][3] = Y[3][0]-Y[3][1]+Y[3][2]-Y[3][3];
+
+ Y[0][0] = pieces[0][0]+pieces[0][1]+pieces[0][2]+pieces[0][3];
+ Y[0][1] = pieces[1][0]+pieces[1][1]+pieces[1][2]+pieces[1][3];
+ Y[0][2] = pieces[2][0]+pieces[2][1]+pieces[2][2]+pieces[2][3];
+ Y[0][3] = pieces[3][0]+pieces[3][1]+pieces[3][2]+pieces[3][3];
+
+ Y[1][0] = pieces[0][0]+pieces[0][1]-pieces[0][2]-pieces[0][3];
+ Y[1][1] = pieces[1][0]+pieces[1][1]-pieces[1][2]-pieces[1][3];
+ Y[1][2] = pieces[2][0]+pieces[2][1]-pieces[2][2]-pieces[2][3];
+ Y[1][3] = pieces[3][0]+pieces[3][1]-pieces[3][2]-pieces[3][3];
+
+ Y[2][0] = pieces[0][0]-pieces[0][1]-pieces[0][2]+pieces[0][3];
+ Y[2][1] = pieces[1][0]-pieces[1][1]-pieces[1][2]+pieces[1][3];
+ Y[2][2] = pieces[2][0]-pieces[2][1]-pieces[2][2]+pieces[2][3];
+ Y[2][3] = pieces[3][0]-pieces[3][1]-pieces[3][2]+pieces[3][3];
+
+ Y[3][0] = pieces[0][0]-pieces[0][1]+pieces[0][2]-pieces[0][3];
+ Y[3][1] = pieces[1][0]-pieces[1][1]+pieces[1][2]-pieces[1][3];
+ Y[3][2] = pieces[2][0]-pieces[2][1]+pieces[2][2]-pieces[2][3];
+ Y[3][3] = pieces[3][0]-pieces[3][1]+pieces[3][2]-pieces[3][3];
+}
+
+static inline void ff_h264_hadamard_mult_2x2(int16_t Y[2][2])
+{
+ int16_t pieces[2][2];
+
+ pieces[0][0] = Y[0][0]+Y[0][1];
+ pieces[0][1] = Y[1][0]+Y[1][1];
+ pieces[1][0] = Y[0][0]-Y[0][1];
+ pieces[1][1] = Y[1][0]-Y[1][1];
+ Y[0][0] = pieces[0][0]+pieces[0][1];
+ Y[0][1] = pieces[1][0]+pieces[1][1];
+ Y[1][0] = pieces[0][0]-pieces[0][1];
+ Y[1][1] = pieces[1][0]-pieces[1][1];
+}
+
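+/* (0,0) entries of the quantisation (MF) and rescaling (V) matrices for QP%6 = 0..5,
+ * used for the DC Hadamard (de)quantisation here and in h264dsp.c */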
+const int16_t ff_h264_MF00[6] = {13107, 11916, 10082, 9362, 8192, 7282};
+const int16_t ff_h264_V00[6] = {10*16, 11*16, 13*16, 14*16, 16*16, 18*16};
+
+static inline void ff_h264_hadamard_invquant_2x2(int16_t Y[2][2], int QP)
+{
+ int32_t V = ff_h264_V00[QP%6];
+ int div = QP/6;
+
+ V <<= div;
+ Y[0][0] = (Y[0][0]*V) >> 5;
+ Y[0][1] = (Y[0][1]*V) >> 5;
+ Y[1][0] = (Y[1][0]*V) >> 5;
+ Y[1][1] = (Y[1][1]*V) >> 5;
+}
+
+#define NEIGHBOUR_SUBTYPE_Y 0
+#define NEIGHBOUR_SUBTYPE_U 1
+#define NEIGHBOUR_SUBTYPE_V 2
+
+#define H264_NEIGHBOUR_COUNT_NONZERO_PLANE(PLANE, P) \
+ { \
+ if (x == 0) \
+ { \
+ MacroBlock *leftmb = mb->leftblock; \
+ if (!leftmb) \
+ *nA = -1; \
+ else \
+ *nA = leftmb->PLANE[y][P]; \
+ } \
+ else \
+ *nA = mb->PLANE[y][x-1]; \
+ if (y == 0) \
+ { \
+ MacroBlock *topmb = mb->topblock; \
+ if (!topmb) \
+ *nB = -1; \
+ else \
+ *nB = topmb->PLANE[P][x]; \
+ } \
+ else \
+ *nB = mb->PLANE[y-1][x]; \
+ }
+
+static inline void ff_h264_neighbour_count_nonzero(MacroBlock *mb, int type, int x, int y, int *nA, int *nB)
+{
+ if (type == NEIGHBOUR_SUBTYPE_Y)
+ H264_NEIGHBOUR_COUNT_NONZERO_PLANE(Y_nonzero, 3)
+ else if (type == NEIGHBOUR_SUBTYPE_U)
+ H264_NEIGHBOUR_COUNT_NONZERO_PLANE(U_nonzero, 1)
+ else
+ H264_NEIGHBOUR_COUNT_NONZERO_PLANE(V_nonzero, 1)
+}
+
+#define H264_COUNT_AND_CLIP(x,count) \
+{\
+ if (x != 0)\
+ count++;\
+ x = clip(x, -2047, 2047);\
+}
+
+#define H264_COUNT_AND_CLIP_SUBBLOCK(x,count)\
+{\
+ H264_COUNT_AND_CLIP(x[0][1],count);\
+ H264_COUNT_AND_CLIP(x[0][2],count);\
+ H264_COUNT_AND_CLIP(x[0][3],count);\
+ H264_COUNT_AND_CLIP(x[1][0],count);\
+ H264_COUNT_AND_CLIP(x[1][1],count);\
+ H264_COUNT_AND_CLIP(x[1][2],count);\
+ H264_COUNT_AND_CLIP(x[1][3],count);\
+ H264_COUNT_AND_CLIP(x[2][0],count);\
+ H264_COUNT_AND_CLIP(x[2][1],count);\
+ H264_COUNT_AND_CLIP(x[2][2],count);\
+ H264_COUNT_AND_CLIP(x[2][3],count);\
+ H264_COUNT_AND_CLIP(x[3][0],count);\
+ H264_COUNT_AND_CLIP(x[3][1],count);\
+ H264_COUNT_AND_CLIP(x[3][2],count);\
+ H264_COUNT_AND_CLIP(x[3][3],count);\
+}
+
+static const int8_t zigzagx[16] = { 0,1,0,0,1,2,3,2,1,0,1,2,3,3,2,3 };
+static const int8_t zigzagy[16] = { 0,0,1,2,1,0,0,1,2,3,3,2,1,2,3,3 };
+
+#define H264_ENCODE_INTRA16X16_RESIDUAL_COEFFICIENTS(PLANE) \
+ coefficients[0] = PLANE[0][0]; \
+ coefficients[1] = PLANE[0][1]; \
+ coefficients[2] = PLANE[1][0]; \
+ coefficients[3] = PLANE[1][1]; \
+ h264cavlc_encode(b,coefficients,4,-1,-1,1); // nA and nB are not used in this case
+
+static void ff_h264_encode_intra16x16_residual(PutBitContext *b,DCTELEM YD[4][4],DCTELEM UD[2][2],DCTELEM VD[2][2],
+ Residual *residual, int lumamode, int chromamode, MacroBlock *mb)
+{
+ int lumaACcount = 0;
+ int chromaDCcount = 0;
+ int chromaACcount = 0;
+ int CodedBlockPatternChroma = 0;
+ int CodedBlockPatternLuma = 0;
+ int x,y,i,j;
+ int16_t coefficients[256];
+ int nA,nB;
+
+
+ for (y = 0 ; y < 4 ; y++)
+ for (x = 0 ; x < 4 ; x++)
+ H264_COUNT_AND_CLIP_SUBBLOCK(residual->part4x4Y[y][x],lumaACcount);
+
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ H264_COUNT_AND_CLIP_SUBBLOCK(residual->part4x4U[y][x],chromaACcount);
+ H264_COUNT_AND_CLIP_SUBBLOCK(residual->part4x4V[y][x],chromaACcount);
+ }
+ }
+
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ H264_COUNT_AND_CLIP(UD[y][x],chromaDCcount);
+ H264_COUNT_AND_CLIP(VD[y][x],chromaDCcount);
+ }
+ }
+
+ for (y = 0 ; y < 4 ; y++)
+ for (x = 0 ; x < 4 ; x++)
+ YD[y][x] = clip(YD[y][x], -2047, 2047);
+
+ if(chromaACcount)
+ CodedBlockPatternChroma= 2;
+ else
+ CodedBlockPatternChroma= !!chromaDCcount;
+
+ if (lumaACcount == 0)
+ CodedBlockPatternLuma = 0;
+ else
+ CodedBlockPatternLuma = 1; // actually it is 15 in the ITU spec, but I'd like to use it as an array index
+
+ set_ue_golomb(b, mbtype_map[lumamode][CodedBlockPatternChroma][CodedBlockPatternLuma]); // mb_type
+ set_ue_golomb(b, chromamode); // intra_chroma_pred_mode
+ set_se_golomb(b, 0); // mb_qp_delta
+
+ // encode luma DC coefficients
+
+ ff_h264_neighbour_count_nonzero(mb,NEIGHBOUR_SUBTYPE_Y,0,0,&nA,&nB);
+ for (i = 0 ; i < 16 ; i++)
+ coefficients[i] = YD[zigzagy[i]][zigzagx[i]];
+ h264cavlc_encode(b,coefficients,16,nA,nB,0);
+
+ if (CodedBlockPatternLuma > 0)
+ {
+ for (j = 0 ; j < 4 ; j++)
+ {
+ int X = (j&1) << 1;
+ int Y = j&2;
+
+ for (i = 0 ; i < 4 ; i++)
+ {
+ int x = (i&1)+X;
+ int y = (i>>1)+Y;
+
+ int k;
+
+ for (k = 0 ; k < 15 ; k++)
+ coefficients[k] = residual->part4x4Y[y][x][zigzagy[k+1]][zigzagx[k+1]];
+ ff_h264_neighbour_count_nonzero(mb,NEIGHBOUR_SUBTYPE_Y,x,y,&nA,&nB);
+ mb->Y_nonzero[y][x] = h264cavlc_encode(b,coefficients,15,nA,nB,0);
+ }
+ }
+ }
+ else
+ memset(mb->Y_nonzero, 0, sizeof(mb->Y_nonzero));
+
+ if (CodedBlockPatternChroma == 0)
+ {
+ memset(mb->U_nonzero, 0, sizeof(mb->U_nonzero));
+ memset(mb->V_nonzero, 0, sizeof(mb->V_nonzero));
+ return;
+ }
+
+ // chroma DC levels (CodedBlockPatternChroma is nonzero at this point)
+ H264_ENCODE_INTRA16X16_RESIDUAL_COEFFICIENTS(UD);
+ H264_ENCODE_INTRA16X16_RESIDUAL_COEFFICIENTS(VD);
+
+ if (CodedBlockPatternChroma == 2)
+ {
+ for (i = 0 ; i < 4 ; i++)
+ {
+ int x = i&1;
+ int y = i>>1;
+
+ int k;
+
+ for (k = 0 ; k < 15 ; k++)
+ coefficients[k] = residual->part4x4U[y][x][zigzagy[k+1]][zigzagx[k+1]];
+ ff_h264_neighbour_count_nonzero(mb,NEIGHBOUR_SUBTYPE_U,x,y,&nA,&nB);
+ mb->U_nonzero[y][x] = h264cavlc_encode(b,coefficients,15,nA,nB,0);
+ }
+
+ for (i = 0 ; i < 4 ; i++)
+ {
+ int x = i&1;
+ int y = i>>1;
+
+ int k;
+
+ for (k = 0 ; k < 15 ; k++)
+ coefficients[k] = residual->part4x4V[y][x][zigzagy[k+1]][zigzagx[k+1]];
+ ff_h264_neighbour_count_nonzero(mb,NEIGHBOUR_SUBTYPE_V,x,y,&nA,&nB);
+ mb->V_nonzero[y][x] = h264cavlc_encode(b,coefficients,15,nA,nB,0);
+ }
+ }
+ else
+ {
+ memset(mb->U_nonzero, 0, sizeof(mb->U_nonzero));
+ memset(mb->V_nonzero, 0, sizeof(mb->V_nonzero));
+ }
+}
+
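+// Encode one macroblock in Intra_16x16 mode: pick a prediction mode from the
+// available neighbours (plane, horizontal, vertical or DC), transform the residual
+// with the 4x4 DCT plus the 4x4 luma / 2x2 chroma Hadamard on the DC coefficients,
+// write the CAVLC-coded data and reconstruct the block so it can serve as a
+// prediction reference.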
+static void ff_h264_encode_Intra_16x16(H264Context *t, MacroBlock *targetmb, PutBitContext *b,
+ MacroBlock *destmb)
+{
+ int x,y;
+ int w,h,w2,h2;
+ DCTELEM YD[4][4];
+ DCTELEM UD[2][2];
+ DCTELEM VD[2][2];
+ int qPI;
+ int QPc;
+ int QPy = t->QP;
+ int lumapredmode = 2;
+ int chromapredmode = 0;
+ int leftavail = 0;
+ int topavail = 0;
+
+ qPI = t->QP + t->chroma_qp_index_offset;
+ qPI = clip(qPI, 0, 51);
+ QPc = chroma_qp[qPI];
+
+ w = targetmb->Y_width;
+ h = targetmb->Y_height;
+ w2 = w>>1;
+ h2 = h>>1;
+
+ if (destmb->leftblock != NULL && destmb->leftblock->available)
+ leftavail = 1;
+ if (destmb->topblock != NULL && destmb->topblock->available)
+ topavail = 1;
+
+ // TODO: use better strategy to determine intra16x16 encoding mode
+
+ if (leftavail)
+ {
+ MacroBlock *srcleft = destmb->leftblock;
+
+ if (topavail && w == 16 && h == 16 && srcleft->topblock != 0 && srcleft->topblock->available)
+ {
+ // Plane prediction
+ pred16x16_plane_c(destmb->Y[0], t->refframe_width);
+ pred8x8_plane_c(destmb->U[0], t->refframe_width>>1);
+ pred8x8_plane_c(destmb->V[0], t->refframe_width>>1);
+ lumapredmode = PLANE_PRED8x8;
+ chromapredmode = PLANE_PRED8x8;
+ }
+ else
+ {
+ // Horizontal prediction
+ pred16x16_horizontal_c(destmb->Y[0], t->refframe_width);
+ pred8x8_horizontal_c(destmb->U[0], t->refframe_width>>1);
+ pred8x8_horizontal_c(destmb->V[0], t->refframe_width>>1);
+ lumapredmode = HOR_PRED8x8;
+ chromapredmode = HOR_PRED8x8;
+ }
+ }
+ else // no left neighbour
+ {
+ if (topavail)
+ {
+ // Vertical prediction
+ pred16x16_vertical_c(destmb->Y[0], t->refframe_width);
+ pred8x8_vertical_c(destmb->U[0], t->refframe_width>>1);
+ pred8x8_vertical_c(destmb->V[0], t->refframe_width>>1);
+ lumapredmode = VERT_PRED;
+ chromapredmode = VERT_PRED8x8;
+ }
+ else // nothing available, encode a standard DC block
+ {
+ pred16x16_128_dc_c(destmb->Y[0], t->refframe_width);
+ pred8x8_128_dc_c(destmb->U[0], t->refframe_width>>1);
+ pred8x8_128_dc_c(destmb->V[0], t->refframe_width>>1);
+ lumapredmode = DC_PRED;
+ chromapredmode = DC_PRED8x8;
+ }
+ }
+
+ H264_COPY_16X16BLOCK(t->residual.part4x4Y,(int16_t)targetmb->Y,(int16_t)destmb->Y);
+ H264_COPY_8X8BLOCK(t->residual.part4x4U,(int16_t)targetmb->U,(int16_t)destmb->U);
+ H264_COPY_8X8BLOCK(t->residual.part4x4V,(int16_t)targetmb->V,(int16_t)destmb->V);
+
+ // Transform residual: DCT
+
+ for (y = 0 ; y < 4 ; y++)
+ {
+ for (x = 0 ; x < 4 ; x++)
+ {
+ t->dspcontext.h264_transform_dct_quant(t->residual.part4x4Y[y][x],QPy,1);
+ }
+ }
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ t->dspcontext.h264_transform_dct_quant(t->residual.part4x4U[y][x],QPc,1);
+ t->dspcontext.h264_transform_dct_quant(t->residual.part4x4V[y][x],QPc,1);
+ }
+ }
+
+ // Hadamard
+
+ // For luma
+ for (y = 0 ; y < 4 ; y++)
+ for (x = 0 ; x < 4 ; x++)
+ YD[y][x] = t->residual.part4x4Y[y][x][0][0];
+
+ t->dspcontext.h264_hadamard_mult4x4(YD);
+ t->dspcontext.h264_hadamard_quant_4x4(YD,QPy);
+
+ // For U
+ for (y = 0 ; y < 2 ; y++)
+ for (x = 0 ; x < 2 ; x++)
+ UD[y][x] = t->residual.part4x4U[y][x][0][0];
+ ff_h264_hadamard_mult_2x2(UD);
+ t->dspcontext.h264_hadamard_quant_2x2(UD,QPc);
+
+ // For V
+ for (y = 0 ; y < 2 ; y++)
+ for (x = 0 ; x < 2 ; x++)
+ VD[y][x] = t->residual.part4x4V[y][x][0][0];
+ ff_h264_hadamard_mult_2x2(VD);
+ t->dspcontext.h264_hadamard_quant_2x2(VD,QPc);
+ // Encode macroblock
+
+ ff_h264_encode_intra16x16_residual(b,YD,UD,VD,&(t->residual),lumapredmode,chromapredmode,destmb);
+
+ // Inverse hadamard
+
+ // For luma
+ t->dspcontext.h264_hadamard_mult4x4(YD);
+ t->dspcontext.h264_hadamard_invquant_4x4(YD,QPy);
+ for (y = 0 ; y < 4 ; y++)
+ for (x = 0 ; x < 4 ; x++)
+ t->residual.part4x4Y[y][x][0][0] = YD[y][x];
+
+ // For U
+ ff_h264_hadamard_mult_2x2(UD);
+ ff_h264_hadamard_invquant_2x2(UD,QPc);
+ for (y = 0 ; y < 2 ; y++)
+ for (x = 0 ; x < 2 ; x++)
+ t->residual.part4x4U[y][x][0][0] = UD[y][x];
+ // For V
+ ff_h264_hadamard_mult_2x2(VD);
+ ff_h264_hadamard_invquant_2x2(VD,QPc);
+ for (y = 0 ; y < 2 ; y++)
+ for (x = 0 ; x < 2 ; x++)
+ t->residual.part4x4V[y][x][0][0] = VD[y][x];
+
+ // Inverse DCT and add
+
+ for (y = 0 ; y < 4 ; y++)
+ {
+ for (x = 0 ; x < 4 ; x++)
+ {
+ t->dspcontext.h264_transform_inverse_quant_dct_add(t->residual.part4x4Y[y][x],QPy,1,&(destmb->Y[y*4][x*4]),t->Y_stride);
+ }
+ }
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ t->dspcontext.h264_transform_inverse_quant_dct_add(t->residual.part4x4U[y][x],QPc,1,&(destmb->U[y*4][x*4]),t->U_stride);
+ t->dspcontext.h264_transform_inverse_quant_dct_add(t->residual.part4x4V[y][x],QPc,1,&(destmb->V[y*4][x*4]),t->V_stride);
+ }
+ }
+
+ destmb->available = 1;
+}
+
+#define H264_CODEDBLOCKPATTERN_4X4CHECK(a,b) \
+ for (y = 0 ; !done && y < 4 ; y++)\
+ for (x = 0 ; !done && x < 4 ; x++)\
+ if (residual->part4x4Y[a][b][y][x] != 0) \
+ done = 1;
+#define H264_CODEDBLOCKPATTERN_8X8CHECK(i,j,shift) \
+ done = 0;\
+ H264_CODEDBLOCKPATTERN_4X4CHECK(i+0,j+0)\
+ H264_CODEDBLOCKPATTERN_4X4CHECK(i+0,j+1)\
+ H264_CODEDBLOCKPATTERN_4X4CHECK(i+1,j+0)\
+ H264_CODEDBLOCKPATTERN_4X4CHECK(i+1,j+1)\
+ if (done)\
+ CodedBlockPatternLuma |= (1 << shift);
+
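+// Write the macroblock layer for a P_L0_16x16 macroblock: derive the luma and
+// chroma coded block patterns, extend the skip run when the motion vector matches
+// the skip predictor and there is no residual, otherwise write mb_skip_run,
+// mb_type, the motion vector difference, coded_block_pattern (mapped to its
+// Exp-Golomb code number) and the residual levels.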
+static void ff_h264_encode_inter16x16_residual(H264Context *t, PutBitContext *b,int mv_x,int mv_y,int mv_x2,int mv_y2,
+ Residual *residual,
+ DCTELEM UD[2][2],DCTELEM VD[2][2],int pred_frame_index,MacroBlock *mb,
+ int last_macroblock)
+{
+ static const int8_t me_map[] = { 0, 2, 3, 7, 4, 8,17,13, 5,18, 9,14,10,15,16,
+ 11, 1,32,33,36,34,37,44,40,35,45,38,41,39,42,
+ 43,19, 6,24,25,20,26,21,46,28,27,47,22,29,23,
+ 30,31,12};
+ int coded_block_pattern;
+ int CodedBlockPatternLuma;
+ int CodedBlockPatternChroma;
+ int16_t coefficients[256];
+ int x,y,i,j;
+ int done;
+ int chromaACcount;
+ int chromaDCcount;
+ int nA,nB;
+
+ // coded_block_pattern
+
+ CodedBlockPatternLuma = 0;
+
+ // first 8x8 block
+ H264_CODEDBLOCKPATTERN_8X8CHECK(0,0,0)
+
+ // second 8x8 block
+ H264_CODEDBLOCKPATTERN_8X8CHECK(0,2,1)
+
+ // third 8x8 block
+ H264_CODEDBLOCKPATTERN_8X8CHECK(2,0,2)
+
+ // fourth 8x8 block
+ H264_CODEDBLOCKPATTERN_8X8CHECK(2,2,3)
+
+ // check for too large values in luma
+ for (y = 0 ; y < 4 ; y++)
+ {
+ for (x = 0 ; x < 4 ; x++)
+ {
+ for (i = 0 ; i < 4 ; i++)
+ for (j = 0 ; j < 4 ; j++)
+ residual->part4x4Y[y][x][i][j] = clip(residual->part4x4Y[y][x][i][j], -2047, 2047);
+ }
+ }
+
+ chromaDCcount = 0;
+ chromaACcount = 0;
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ H264_COUNT_AND_CLIP_SUBBLOCK(residual->part4x4U[y][x],chromaACcount);
+ H264_COUNT_AND_CLIP_SUBBLOCK(residual->part4x4V[y][x],chromaACcount);
+ }
+ }
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ H264_COUNT_AND_CLIP(UD[y][x],chromaDCcount);
+ H264_COUNT_AND_CLIP(VD[y][x],chromaDCcount);
+ }
+ }
+
+ if (chromaACcount)
+ CodedBlockPatternChroma= 2;
+ else
+ CodedBlockPatternChroma= !!chromaDCcount;
+
+ if (mv_x2 == 0 && mv_y2 == 0 && CodedBlockPatternChroma == 0 && CodedBlockPatternLuma == 0) // entirely predictable
+ {
+ t->mb_skip_run++;
+ if (last_macroblock)
+ set_ue_golomb(b, t->mb_skip_run); // mb_skip_run is coded as ue(v)
+ }
+ else
+ {
+ set_ue_golomb(b, t->mb_skip_run); // mb_skip_run
+ t->mb_skip_run = 0;
+
+ set_ue_golomb(b, 0); // mb_type = P_L0_16x16
+
+ // mb_pred()
+
+ set_se_golomb(b, mv_x);
+ set_se_golomb(b, mv_y);
+
+ coded_block_pattern = (CodedBlockPatternChroma << 4)|CodedBlockPatternLuma;
+ set_ue_golomb(b,me_map[coded_block_pattern]);
+ }
+
+ // residual()
+
+ if (CodedBlockPatternLuma == 0 && CodedBlockPatternChroma == 0) // nothing left to do
+ {
+ memset(mb->Y_nonzero, 0, sizeof(mb->Y_nonzero));
+ memset(mb->U_nonzero, 0, sizeof(mb->U_nonzero));
+ memset(mb->V_nonzero, 0, sizeof(mb->V_nonzero));
+ return;
+ }
+
+ set_se_golomb(b, 0); // mb_qp_delta
+
+ // encode luma levels
+ for (j = 0 ; j < 4 ; j++)
+ {
+ int X = (j&1) << 1;
+ int Y = j&2;
+
+ if ((CodedBlockPatternLuma >> j)&1)
+ {
+ for (i = 0 ; i < 4 ; i++)
+ {
+ int x = (i&1)+X;
+ int y = (i>>1)+Y;
+
+ int k;
+
+ for (k = 0 ; k < 16 ; k++)
+ coefficients[k] = residual->part4x4Y[y][x][zigzagy[k]][zigzagx[k]];
+ ff_h264_neighbour_count_nonzero(mb,NEIGHBOUR_SUBTYPE_Y,x,y,&nA,&nB);
+ mb->Y_nonzero[y][x] = h264cavlc_encode(b,coefficients,16,nA,nB,0);
+ }
+ }
+ else
+ {
+ for (i = 0 ; i < 4 ; i++)
+ {
+ int x = (i&1)+X;
+ int y = (i>>1)+Y;
+ mb->Y_nonzero[y][x] = 0;
+ }
+ }
+ }
+
+ // chroma DC levels
+ if (CodedBlockPatternChroma != 0)
+ {
+ coefficients[0] = UD[0][0];
+ coefficients[1] = UD[0][1];
+ coefficients[2] = UD[1][0];
+ coefficients[3] = UD[1][1];
+ h264cavlc_encode(b,coefficients,4,-1,-1,1); // nA and nB are not used in this case
+
+ coefficients[0] = VD[0][0];
+ coefficients[1] = VD[0][1];
+ coefficients[2] = VD[1][0];
+ coefficients[3] = VD[1][1];
+ h264cavlc_encode(b,coefficients,4,-1,-1,1); // nA and nB are not used in this case
+ }
+
+ if (CodedBlockPatternChroma == 2)
+ {
+ for (i = 0 ; i < 4 ; i++)
+ {
+ int x = i&1;
+ int y = i>>1;
+
+ int k;
+
+ for (k = 0 ; k < 15 ; k++)
+ coefficients[k] = residual->part4x4U[y][x][zigzagy[k+1]][zigzagx[k+1]];
+ ff_h264_neighbour_count_nonzero(mb,NEIGHBOUR_SUBTYPE_U,x,y,&nA,&nB);
+ mb->U_nonzero[y][x] = h264cavlc_encode(b,coefficients,15,nA,nB,0);
+ }
+
+ for (i = 0 ; i < 4 ; i++)
+ {
+ int x = i&1;
+ int y = i>>1;
+
+ int k;
+
+ for (k = 0 ; k < 15 ; k++)
+ coefficients[k] = residual->part4x4V[y][x][zigzagy[k+1]][zigzagx[k+1]];
+ ff_h264_neighbour_count_nonzero(mb,NEIGHBOUR_SUBTYPE_V,x,y,&nA,&nB);
+ mb->V_nonzero[y][x] = h264cavlc_encode(b,coefficients,15,nA,nB,0);
+ }
+ }
+ else
+ {
+ int x,y;
+
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ mb->U_nonzero[y][x] = 0;
+ mb->V_nonzero[y][x] = 0;
+ }
+ }
+ }
+}
+
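+// Build the motion compensated prediction for a 16x16 macroblock. Only full-pel
+// motion is used for luma (mvx and mvy are multiples of 4 here); the chroma planes
+// are sampled at half the luma position and averaged bilinearly when that position
+// falls between chroma samples.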
+static void ff_h264_predict(H264Context *t, MacroBlock *destmb, FrameInfo *refframe, int mbx, int mby, int mvx, int mvy)
+{
+ int x = mbx << 4;
+ int y = mby << 4;
+ AVPicture *refpic = &(refframe->reconstructed_picture);
+ uint8_t *data;
+ int linesize;
+ int i,j;
+ int startx,starty;
+ int w,h,w2,h2;
+ int xmod,ymod;
+
+ w = destmb->Y_width;
+ h = destmb->Y_height;
+ w2 = w>>1;
+ h2 = h>>1;
+
+ startx = x+(mvx/4);
+ starty = y+(mvy/4);
+
+ linesize = refpic->linesize[0];
+ data = refpic->data[0]+starty*linesize+startx;
+
+ for (i = 0 ; i < h ; i++)
+ {
+ for (j = 0 ; j < w ; j++)
+ destmb->Y[i][j] = data[j];
+ data += linesize;
+ }
+
+ linesize = refpic->linesize[1];
+ data = refpic->data[1]+(starty/2)*linesize+startx/2;
+
+ xmod = startx & 1;
+ ymod = starty & 1;
+
+ if (xmod == 0 && ymod == 0)
+ {
+ for (i = 0 ; i < h2 ; i++)
+ {
+ for (j = 0 ; j < w2 ; j++)
+ destmb->U[i][j] = data[j];
+ data += linesize;
+ }
+
+ linesize = refpic->linesize[2];
+ data = refpic->data[2]+(starty/2)*linesize+startx/2;
+ for (i = 0 ; i < h2 ; i++)
+ {
+ for (j = 0 ; j < w2 ; j++)
+ destmb->V[i][j] = data[j];
+ data += linesize;
+ }
+ }
+ else if (xmod == 0 && ymod != 0)
+ {
+ for (i = 0 ; i < h2 ; i++)
+ {
+ for (j = 0 ; j < w2 ; j++)
+ destmb->U[i][j] = (uint8_t)(((int)data[j]+(int)data[j+linesize]+1)/2);
+ data += linesize;
+ }
+
+ linesize = refpic->linesize[2];
+ data = refpic->data[2]+(starty/2)*linesize+startx/2;
+ for (i = 0 ; i < h2 ; i++)
+ {
+ for (j = 0 ; j < w2 ; j++)
+ destmb->V[i][j] = (uint8_t)(((int)data[j]+(int)data[j+linesize]+1)/2);
+ data += linesize;
+ }
+ }
+ else if (xmod != 0 && ymod == 0)
+ {
+ for (i = 0 ; i < h2 ; i++)
+ {
+ for (j = 0 ; j < w2 ; j++)
+ destmb->U[i][j] = (uint8_t)(((int)data[j]+(int)data[j+1]+1)/2);
+ data += linesize;
+ }
+
+ linesize = refpic->linesize[2];
+ data = refpic->data[2]+(starty/2)*linesize+startx/2;
+ for (i = 0 ; i < h2 ; i++)
+ {
+ for (j = 0 ; j < w2 ; j++)
+ destmb->V[i][j] = (uint8_t)(((int)data[j]+(int)data[j+1]+1)/2);
+ data += linesize;
+ }
+ }
+ else // xmod != 0 && ymod != 0
+ {
+ for (i = 0 ; i < h2 ; i++)
+ {
+ for (j = 0 ; j < w2 ; j++)
+ destmb->U[i][j] = (uint8_t)(((int)data[j]+(int)data[j+1]+(int)data[j+linesize+1]+(int)data[j+linesize]+2)/4);
+ data += linesize;
+ }
+
+ linesize = refpic->linesize[2];
+ data = refpic->data[2]+(starty/2)*linesize+startx/2;
+ for (i = 0 ; i < h2 ; i++)
+ {
+ for (j = 0 ; j < w2 ; j++)
+ destmb->V[i][j] = (uint8_t)(((int)data[j]+(int)data[j+1]+(int)data[j+linesize+1]+(int)data[j+linesize]+2)/4);
+ data += linesize;
+ }
+ }
+}
+
+#define MAXSEARCHSTEPS 8
+#define SEARCHWIDTH 1
+
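+// Greedy full-pel motion search: starting from the co-located position, repeatedly
+// examine the (2*SEARCHWIDTH+1)^2 neighbourhood of the current best candidate and
+// keep the candidate with the lowest estimated cost (bits for the motion vector
+// difference plus a SAD-based estimate of the residual size), for at most
+// MAXSEARCHSTEPS refinement steps.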
+static void ff_h264_find_motion_vector_and_prediction(H264Context *t, MacroBlock *targetmb, FrameInfo *refframe,
+ int mbx, int mby, int *mvx, int *mvy,
+ int pred_mvx, int pred_mvy, MacroBlock *destmb)
+{
+ int x = mbx << 4;
+ int y = mby << 4;
+ int bestx, besty;
+ int curx, cury;
+ int minbitsize = 0x7FFFFFFF;
+ int QP = t->QP;
+ int done = 0;
+ int numsteps = 0;
+
+ bestx = x;
+ besty = y;
+ curx = x;
+ cury = y;
+
+ {
+ int scanx = x;
+ int scany = y;
+ int weight;
+ int xvec = -pred_mvx; // it's actually this difference which will be encoded!
+ int yvec = -pred_mvy;
+ int sae = t->dspcontext.pix_abs[0][0](0,targetmb->Y[0],
+ refframe->reconstructed_picture.data[0] + scany * refframe->reconstructed_picture.linesize[0] + scanx,
+ refframe->reconstructed_picture.linesize[0], 16);
+ sae += t->dspcontext.pix_abs[1][0](0,targetmb->U[0],
+ refframe->reconstructed_picture.data[1] + (scany/2) * refframe->reconstructed_picture.linesize[1] + scanx/2,
+ refframe->reconstructed_picture.linesize[1], 8);
+ sae += t->dspcontext.pix_abs[1][0](0,targetmb->V[0],
+ refframe->reconstructed_picture.data[2] + (scany/2) * refframe->reconstructed_picture.linesize[2] + scanx/2,
+ refframe->reconstructed_picture.linesize[2], 8);
+ sae = FFMIN(sae>>4, 2047);
+ minbitsize = mv_len_table[xvec+MVTABLE_OFFSET]
+ + mv_len_table[yvec+MVTABLE_OFFSET];
+ weight = sae_codeblocksize_relation[QP>>2][sae>>8];
+ weight += (sae_codeblocksize_relation[QP>>2][FFMIN(((sae>>8)+1), 8)]
+ - sae_codeblocksize_relation[QP>>2][sae>>8] )
+ * (sae - ((sae>>8) << 8)) / ( (FFMIN(((sae>>8)+1), 8) << 8)
+ - ((sae>>8) << 8) );
+ minbitsize += weight;
+ }
+
+ while (!done && numsteps < MAXSEARCHSTEPS)
+ {
+ int startx = curx - SEARCHWIDTH;
+ int starty = cury - SEARCHWIDTH;
+ int stopx = curx + SEARCHWIDTH + 1;
+ int stopy = cury + SEARCHWIDTH + 1;
+ int foundbetter = 0;
+ int scanx, scany;
+
+ if (startx < 0)
+ startx = 0;
+ if (starty < 0)
+ starty = 0;
+ if (stopx > t->refframe_width - 16 + 1)
+ stopx = t->refframe_width - 16 + 1;
+ if (stopy > t->refframe_height - 16 + 1)
+ stopy = t->refframe_height -16 + 1;
+
+ for(scany = starty; scany < stopy; scany++)
+ {
+ for(scanx = startx; scanx < stopx; scanx++)
+ {
+ if (!(curx == scanx && cury == scany))
+ {
+ int xvec = (scanx-x)*4-pred_mvx; // it's actually this difference which will be encoded!
+ int yvec = (scany-y)*4-pred_mvy;
+ int bitsize;
+ int weight;
+ int xmod = scanx%2;
+ int ymod = scany%2;
+ int absnum = xmod+ymod*2;
+ int sae = t->dspcontext.pix_abs[0][0](0,targetmb->Y[0],
+ refframe->reconstructed_picture.data[0] + scany * refframe->reconstructed_picture.linesize[0] + scanx,
+ refframe->reconstructed_picture.linesize[0], 16);
+
+ sae += t->dspcontext.pix_abs[1][absnum](0,targetmb->U[0],
+ refframe->reconstructed_picture.data[1] + (scany/2) * refframe->reconstructed_picture.linesize[1] + scanx/2,
+ refframe->reconstructed_picture.linesize[1], 8);
+ sae += t->dspcontext.pix_abs[1][absnum](0,targetmb->V[0],
+ refframe->reconstructed_picture.data[2] + (scany/2) * refframe->reconstructed_picture.linesize[2] + scanx/2,
+ refframe->reconstructed_picture.linesize[2], 8);
+ sae = FFMIN(sae>>4, 2047);
+ bitsize = mv_len_table[xvec+MVTABLE_OFFSET]
+ + mv_len_table[yvec+MVTABLE_OFFSET];
+ weight = sae_codeblocksize_relation[QP>>2][sae>>8];
+ weight += (sae_codeblocksize_relation[QP>>2][FFMIN(((sae>>8)+1), 8)]
+ - sae_codeblocksize_relation[QP>>2][sae>>8] )
+ * (sae - ((sae>>8) << 8)) / ( (FFMIN(((sae>>8)+1), 8) << 8)
+ - ((sae>>8) << 8) );
+ bitsize += weight;
+ if (bitsize < minbitsize)
+ {
+ minbitsize = bitsize;
+ bestx = scanx;
+ besty = scany;
+ foundbetter = 1;
+ }
+ }
+ }
+ }
+
+ if (foundbetter)
+ {
+ curx = bestx;
+ cury = besty;
+ numsteps++;
+ }
+ else
+ done = 1;
+ }
+ {
+ int mvx = (bestx - x) * 4;
+ int mvy = (besty - y) * 4;
+
+ ff_h264_predict(t, destmb, refframe, mbx, mby, mvx, mvy);
+ }
+
+ *mvx = (bestx - x) * 4;
+ *mvy = (besty - y) * 4;
+}
+
+// Adjust the values of mvx and mvy based on the prediction from the neighbouring macroblocks
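+// The predictor is the component-wise median of the left (A), top (B) and
+// top-right (C, falling back to top-left D) neighbours; mvpred_x2/mvpred_y2 hold
+// the predictor used for the skip decision, which is forced to zero when the left
+// or top neighbour is missing or has a zero motion vector.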
+static void ff_h264_estimate_motion_vectors(MacroBlock *destmb, int *mvpred_x, int *mvpred_y, int *mvpred_x2, int *mvpred_y2)
+{
+ int mvAx = 0, mvAy = 0;
+ int mvBx = 0, mvBy = 0;
+ int mvCx = 0, mvCy = 0;
+ int mvDx = 0, mvDy = 0;
+ int Aavail = 0;
+ int Bavail = 0;
+ int Cavail = 0;
+ int Davail = 0;
+
+ if (destmb->leftblock != NULL && destmb->leftblock->available)
+ {
+ Aavail = 1;
+ mvAx = destmb->leftblock->mv_x;
+ mvAy = destmb->leftblock->mv_y;
+ }
+ if (destmb->topblock != NULL)
+ {
+ MacroBlock *topblock = destmb->topblock;
+
+ if (topblock->available)
+ {
+ Bavail = 1;
+ mvBx = topblock->mv_x;
+ mvBy = topblock->mv_y;
+ }
+ if (topblock->leftblock != NULL && topblock->leftblock->available)
+ {
+ Davail = 1;
+ mvDx = topblock->leftblock->mv_x;
+ mvDy = topblock->leftblock->mv_y;
+ }
+ if (topblock->rightblock != NULL && topblock->rightblock->available)
+ {
+ Cavail = 1;
+ mvCx = topblock->rightblock->mv_x;
+ mvCy = topblock->rightblock->mv_y;
+ }
+ }
+
+ if (!Cavail)
+ {
+ Cavail = Davail;
+ mvCx = mvDx;
+ mvCy = mvDy;
+ }
+
+ if (!Bavail && !Cavail && Aavail)
+ {
+ mvBx = mvAx;
+ mvBy = mvAy;
+ mvCx = mvAx;
+ mvCy = mvAy;
+ }
+
+ *mvpred_x = mid_pred(mvAx,mvBx,mvCx);
+ *mvpred_y = mid_pred(mvAy,mvBy,mvCy);
+
+ if (!Aavail || !Bavail || (Aavail && mvAx == 0 && mvAy == 0) || (Bavail && mvBx == 0 && mvBy == 0))
+ {
+ *mvpred_x2 = 0;
+ *mvpred_y2 = 0;
+ }
+ else
+ {
+ *mvpred_x2 = *mvpred_x;
+ *mvpred_y2 = *mvpred_y;
+ }
+}
+
+/*
+ *
+ * Book p. 184, spec p. 182
+ */
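+// Deblock one line of luma samples across a block edge: p[] holds the samples on
+// one side of the edge, q[] those on the other. Filtering is skipped when bS is 0
+// or the QP-dependent alpha/beta thresholds are not met; bS == 4 selects the
+// strong filter, otherwise the clipped-delta filter is applied.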
+static inline void ff_h264_deblocking_filter_line_luma(int p[4], int q[4], int QP, int bS)
+{
+ int delta0, delta0i, deltap1i, deltaq1i, deltap1, deltaq1;
+ int pa0, pa1, pa2, qa0, qa1, qa2;
+ int alpha, beta;
+
+ if (bS == 0)
+ return;
+
+ alpha = alpha_table[QP];
+ beta = beta_table[QP];
+
+ if (!(
+ (ABS(p[0] - q[0]) < alpha) /* (1) */
+ &&
+ (ABS(p[1] - p[0]) < beta) /* (2) */
+ &&
+ (ABS(q[1] - q[0]) < beta) /* (3) */
+ ))
+ return;
+
+ pa0 = p[0];
+ pa1 = p[1];
+ pa2 = p[2];
+ qa0 = q[0];
+ qa1 = q[1];
+ qa2 = q[2];
+
+ if (bS == 4)
+ {
+ int aP = ABS(p[2] - p[0]);
+ int aQ = ABS(q[2] - q[0]);
+
+ if (aP < beta && ABS(p[0] - q[0]) < ((alpha>>2) + 2))
+ {
+ // Luminance filtering
+ pa0 = (p[2] + 2*p[1] + 2*p[0] + 2*q[0] + q[1] + 4) >> 3; /* (20) */
+ pa1 = (p[2] + p[1] + p[0] + q[0] + 2) >> 2; /* (21) */
+ pa2 = (2*p[3] + 3*p[2] + p[1] + p[0] + q[0] + 4) >> 3; /* (22) */
+ }
+ else
+ pa0 = (2*p[1] + p[0] + q[1] + 2) >> 2; /* (23) */
+
+ if (aQ < beta && ABS(p[0] - q[0]) < ((alpha>>2) + 2))
+ {
+ // Luminance filtering
+ qa0 = (p[1] + 2*p[0] + 2*q[0] + 2*q[1] + q[2] + 4) >> 3; /* (20) */
+ qa1 = (p[0] + q[0] + q[1] + q[2] + 2) >> 2; /* (21) */
+ qa2 = (2*q[3] + 3*q[2] + q[1] + q[0] + p[0] + 4) >> 3; /* (22) */
+ }
+ else
+ qa0 = (2*q[1] + q[0] + p[1] + 2) >> 2; /* (23) */
+ }
+ else
+ {
+ int aP = ABS(p[2] - p[0]);
+ int aQ = ABS(q[2] - q[0]);
+ int c0, c1;
+
+ c0 = c1 = tc0_table[QP][bS-1];
+
+ // All conditions are met to filter this line of samples
+
+ delta0i = (((q[0] - p[0])<<2) + (p[1] - q[1]) + 4) >> 3;
+
+ if (aP < beta) /* condition (8) */
+ {
+ /* c0 should be incremented for each condition being true, 8-473 */
+ c0++;
+
+ deltap1i = (p[2] + ((p[0] + q[0] + 1) >> 1) - (p[1]<<1)) >> 1;
+ deltap1 = clip(deltap1i, -c1, c1);
+ pa1 = p[1] + deltap1;
+ }
+
+ if (aQ < beta) /* condition (9) */
+ {
+ /* c0 should be incremented for each condition being true, 8-473 */
+ c0++;
+
+ deltaq1i = (q[2] + ((p[0] + q[0] + 1) >> 1) - (q[1]<<1)) >> 1;
+ deltaq1 = clip(deltaq1i, -c1, c1);
+ qa1 = q[1] + deltaq1;
+ }
+
+ delta0 = clip(delta0i, -c0, c0);
+ pa0 = clip_uint8(p[0] + delta0);
+ qa0 = clip_uint8(q[0] - delta0);
+ }
+ p[0] = pa0;
+ p[1] = pa1;
+ p[2] = pa2;
+ q[0] = qa0;
+ q[1] = qa1;
+ q[2] = qa2;
+}
+
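+// Chroma counterpart of the above: for chroma only p[0] and q[0] are ever modified.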
+static inline void ff_h264_deblocking_filter_line_chroma(int p[4], int q[4], int QP, int bS)
+{
+ int delta0i, delta0;
+ int pa0, pa1, pa2, qa0, qa1, qa2;
+ int alpha, beta;
+
+ if (bS == 0)
+ return;
+
+ alpha = alpha_table[QP];
+ beta = beta_table[QP];
+
+ if (!(
+ (ABS(p[0] - q[0]) < alpha) /* (1) */
+ &&
+ (ABS(p[1] - p[0]) < beta) /* (2) */
+ &&
+ (ABS(q[1] - q[0]) < beta) /* (3) */
+ ))
+ return;
+
+ pa0 = p[0];
+ pa1 = p[1];
+ pa2 = p[2];
+ qa0 = q[0];
+ qa1 = q[1];
+ qa2 = q[2];
+
+ if (bS == 4)
+ {
+ pa0 = ((p[1]<<1) + p[0] + q[1] + 2) >> 2; /* (23) */
+ qa0 = ((q[1]<<1) + q[0] + p[1] + 2) >> 2; /* (23) */
+ }
+ else
+ {
+ int c0, c1;
+
+ c0 = c1 = tc0_table[QP][bS-1];
+
+ // All conditions are met to filter this line of samples
+
+ delta0i = (((q[0] - p[0])<<2) + (p[1] - q[1]) + 4) >> 3;
+
+ c0++; /* p. 191, (8-474) */
+
+ delta0 = clip(delta0i, -c0, c0);
+ pa0 = clip_uint8(p[0] + delta0);
+ qa0 = clip_uint8(q[0] - delta0);
+ }
+ p[0] = pa0;
+ p[1] = pa1;
+ p[2] = pa2;
+ q[0] = qa0;
+ q[1] = qa1;
+ q[2] = qa2;
+}
+
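+// Apply the in-loop deblocking filter to one macroblock: vertical edges first (the
+// left macroblock edge, then the internal edges at x = 4, 8, 12), then horizontal
+// edges (top edge, then y = 4, 8, 12). Boundary strength is 4 (3 internally) for
+// IDR macroblocks, 2 when either adjacent 4x4 block has nonzero coefficients, 1 for
+// a motion vector difference of at least one full pel, and 0 otherwise.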
+static void ff_h264_deblock_macroblock(MacroBlock *mb, int filter_left_edge, int filter_top_edge, int isIDR, int QPYav, int QPCav)
+{
+ int p[4],q[4];
+ int x,y;
+ int bS[4][16];
+
+ // First step is filtering of vertical edges
+
+ // first filter left edge
+ if (filter_left_edge)
+ {
+ MacroBlock *leftmb = mb->leftblock;
+
+ // first Y
+ for (y = 0 ; y < 16 ; y++)
+ {
+ if (isIDR)
+ bS[0][y] = 4;
+ else
+ {
+ if (leftmb->Y_nonzero[y>>2][3] != 0 || mb->Y_nonzero[y>>2][0] != 0)
+ bS[0][y] = 2;
+ else
+ {
+ if (ABS(leftmb->mv_x - mb->mv_x) >= 4 || ABS(leftmb->mv_y - mb->mv_y) >= 4)
+ bS[0][y] = 1;
+ else
+ bS[0][y] = 0;
+ }
+ }
+
+ p[0] = leftmb->Y[y][15];
+ p[1] = leftmb->Y[y][14];
+ p[2] = leftmb->Y[y][13];
+ p[3] = leftmb->Y[y][12];
+ q[0] = mb->Y[y][0];
+ q[1] = mb->Y[y][1];
+ q[2] = mb->Y[y][2];
+ q[3] = mb->Y[y][3];
+
+ ff_h264_deblocking_filter_line_luma(p,q,QPYav,bS[0][y]);
+
+ leftmb->Y[y][15] = p[0];
+ leftmb->Y[y][14] = p[1];
+ leftmb->Y[y][13] = p[2];
+ mb->Y[y][0] = q[0];
+ mb->Y[y][1] = q[1];
+ mb->Y[y][2] = q[2];
+ }
+
+ // then U and V
+
+ for (y = 0 ; y < 8 ; y++)
+ {
+ p[0] = leftmb->U[y][7];
+ p[1] = leftmb->U[y][6];
+ p[2] = leftmb->U[y][5];
+ p[3] = leftmb->U[y][4];
+ q[0] = mb->U[y][0];
+ q[1] = mb->U[y][1];
+ q[2] = mb->U[y][2];
+ q[3] = mb->U[y][3];
+
+ ff_h264_deblocking_filter_line_chroma(p,q,QPCav,bS[0][y<<1]);
+
+ leftmb->U[y][7] = p[0];
+ leftmb->U[y][6] = p[1];
+ leftmb->U[y][5] = p[2];
+ mb->U[y][0] = q[0];
+ mb->U[y][1] = q[1];
+ mb->U[y][2] = q[2];
+
+ p[0] = leftmb->V[y][7];
+ p[1] = leftmb->V[y][6];
+ p[2] = leftmb->V[y][5];
+ p[3] = leftmb->V[y][4];
+ q[0] = mb->V[y][0];
+ q[1] = mb->V[y][1];
+ q[2] = mb->V[y][2];
+ q[3] = mb->V[y][3];
+
+ ff_h264_deblocking_filter_line_chroma(p,q,QPCav,bS[0][y<<1]);
+
+ leftmb->V[y][7] = p[0];
+ leftmb->V[y][6] = p[1];
+ leftmb->V[y][5] = p[2];
+ mb->V[y][0] = q[0];
+ mb->V[y][1] = q[1];
+ mb->V[y][2] = q[2];
+ }
+ }
+
+ // then the internal vertical edges
+
+ for (x = 4 ; x < 16 ; x += 4)
+ {
+ int xidx = x >> 2;
+
+ // first Y
+ for (y = 0 ; y < 16 ; y++)
+ {
+ if (isIDR)
+ bS[xidx][y] = 3;
+ else
+ {
+ if (mb->Y_nonzero[y>>2][(x>>2)-1] != 0 || mb->Y_nonzero[y>>2][x>>2] != 0)
+ bS[xidx][y] = 2;
+ else
+ {
+ // one motion vector per 16x16 block, so there will be no difference
+ // between the motion vectors
+ bS[xidx][y] = 0;
+ }
+ }
+
+ p[0] = mb->Y[y][x-1];
+ p[1] = mb->Y[y][x-2];
+ p[2] = mb->Y[y][x-3];
+ p[3] = mb->Y[y][x-4];
+ q[0] = mb->Y[y][x+0];
+ q[1] = mb->Y[y][x+1];
+ q[2] = mb->Y[y][x+2];
+ q[3] = mb->Y[y][x+3];
+
+ ff_h264_deblocking_filter_line_luma(p,q,QPYav,bS[xidx][y]);
+
+ mb->Y[y][x-1] = p[0];
+ mb->Y[y][x-2] = p[1];
+ mb->Y[y][x-3] = p[2];
+ mb->Y[y][x+0] = q[0];
+ mb->Y[y][x+1] = q[1];
+ mb->Y[y][x+2] = q[2];
+ }
+ }
+
+ // then U and V
+
+ for (y = 0 ; y < 8 ; y++)
+ {
+ p[0] = mb->U[y][3];
+ p[1] = mb->U[y][2];
+ p[2] = mb->U[y][1];
+ p[3] = mb->U[y][0];
+ q[0] = mb->U[y][4];
+ q[1] = mb->U[y][5];
+ q[2] = mb->U[y][6];
+ q[3] = mb->U[y][7];
+
+ ff_h264_deblocking_filter_line_chroma(p,q,QPCav,bS[2][y<<1]);
+
+ mb->U[y][3] = p[0];
+ mb->U[y][2] = p[1];
+ mb->U[y][1] = p[2];
+ mb->U[y][4] = q[0];
+ mb->U[y][5] = q[1];
+ mb->U[y][6] = q[2];
+
+ p[0] = mb->V[y][3];
+ p[1] = mb->V[y][2];
+ p[2] = mb->V[y][1];
+ p[3] = mb->V[y][0];
+ q[0] = mb->V[y][4];
+ q[1] = mb->V[y][5];
+ q[2] = mb->V[y][6];
+ q[3] = mb->V[y][7];
+
+ ff_h264_deblocking_filter_line_chroma(p,q,QPCav,bS[2][y<<1]);
+
+ mb->V[y][3] = p[0];
+ mb->V[y][2] = p[1];
+ mb->V[y][1] = p[2];
+ mb->V[y][4] = q[0];
+ mb->V[y][5] = q[1];
+ mb->V[y][6] = q[2];
+ }
+
+ // Next step is filtering of horizontal edges
+
+ // first, filter top edge
+
+ if (filter_top_edge)
+ {
+ MacroBlock *topmb = mb->topblock;
+
+ // first Y
+ for (x = 0 ; x < 16 ; x++)
+ {
+ if (isIDR)
+ bS[0][x] = 4;
+ else
+ {
+ if (topmb->Y_nonzero[3][x>>2] != 0 || mb->Y_nonzero[0][x>>2] != 0)
+ bS[0][x] = 2;
+ else
+ {
+ if (ABS(topmb->mv_x - mb->mv_x) >= 4 || ABS(topmb->mv_y - mb->mv_y) >= 4)
+ bS[0][x] = 1;
+ else
+ bS[0][x] = 0;
+ }
+ }
+
+ p[0] = topmb->Y[15][x];
+ p[1] = topmb->Y[14][x];
+ p[2] = topmb->Y[13][x];
+ p[3] = topmb->Y[12][x];
+ q[0] = mb->Y[0][x];
+ q[1] = mb->Y[1][x];
+ q[2] = mb->Y[2][x];
+ q[3] = mb->Y[3][x];
+
+ ff_h264_deblocking_filter_line_luma(p,q,QPYav,bS[0][x]);
+
+ topmb->Y[15][x] = p[0];
+ topmb->Y[14][x] = p[1];
+ topmb->Y[13][x] = p[2];
+ mb->Y[0][x] = q[0];
+ mb->Y[1][x] = q[1];
+ mb->Y[2][x] = q[2];
+ }
+
+ // then U and V
+
+ for (x = 0 ; x < 8 ; x++)
+ {
+ p[0] = topmb->U[7][x];
+ p[1] = topmb->U[6][x];
+ p[2] = topmb->U[5][x];
+ p[3] = topmb->U[4][x];
+ q[0] = mb->U[0][x];
+ q[1] = mb->U[1][x];
+ q[2] = mb->U[2][x];
+ q[3] = mb->U[3][x];
+
+ ff_h264_deblocking_filter_line_chroma(p,q,QPCav,bS[0][x<<1]);
+
+ topmb->U[7][x] = p[0];
+ topmb->U[6][x] = p[1];
+ topmb->U[5][x] = p[2];
+ mb->U[0][x] = q[0];
+ mb->U[1][x] = q[1];
+ mb->U[2][x] = q[2];
+
+ p[0] = topmb->V[7][x];
+ p[1] = topmb->V[6][x];
+ p[2] = topmb->V[5][x];
+ p[3] = topmb->V[4][x];
+ q[0] = mb->V[0][x];
+ q[1] = mb->V[1][x];
+ q[2] = mb->V[2][x];
+ q[3] = mb->V[3][x];
+
+ ff_h264_deblocking_filter_line_chroma(p,q,QPCav,bS[0][x<<1]);
+
+ topmb->V[7][x] = p[0];
+ topmb->V[6][x] = p[1];
+ topmb->V[5][x] = p[2];
+ mb->V[0][x] = q[0];
+ mb->V[1][x] = q[1];
+ mb->V[2][x] = q[2];
+ }
+ }
+
+ // then the internal horizontal edges
+
+ for (y = 4 ; y < 16 ; y += 4)
+ {
+ int yidx = y >> 2;
+
+ // first Y
+ for (x = 0 ; x < 16 ; x++)
+ {
+ if (isIDR)
+ bS[yidx][x] = 3;
+ else
+ {
+ if (mb->Y_nonzero[(y>>2)-1][(x>>2)] != 0 || mb->Y_nonzero[y>>2][x>>2] != 0)
+ bS[yidx][x] = 2;
+ else
+ {
+ // one motion vector per 16x16 block, so there will be no difference
+ // between the motion vectors
+ bS[yidx][x] = 0;
+ }
+ }
+
+ p[0] = mb->Y[y-1][x];
+ p[1] = mb->Y[y-2][x];
+ p[2] = mb->Y[y-3][x];
+ p[3] = mb->Y[y-4][x];
+ q[0] = mb->Y[y+0][x];
+ q[1] = mb->Y[y+1][x];
+ q[2] = mb->Y[y+2][x];
+ q[3] = mb->Y[y+3][x];
+
+ ff_h264_deblocking_filter_line_luma(p,q,QPYav,bS[yidx][x]);
+
+ mb->Y[y-1][x] = p[0];
+ mb->Y[y-2][x] = p[1];
+ mb->Y[y-3][x] = p[2];
+ mb->Y[y+0][x] = q[0];
+ mb->Y[y+1][x] = q[1];
+ mb->Y[y+2][x] = q[2];
+ }
+ }
+
+ // then U and V
+
+ for (x = 0 ; x < 8 ; x++)
+ {
+ p[0] = mb->U[3][x];
+ p[1] = mb->U[2][x];
+ p[2] = mb->U[1][x];
+ p[3] = mb->U[0][x];
+ q[0] = mb->U[4][x];
+ q[1] = mb->U[5][x];
+ q[2] = mb->U[6][x];
+ q[3] = mb->U[7][x];
+
+ ff_h264_deblocking_filter_line_chroma(p,q,QPCav,bS[2][x<<1]);
+
+ mb->U[3][x] = p[0];
+ mb->U[2][x] = p[1];
+ mb->U[1][x] = p[2];
+ mb->U[4][x] = q[0];
+ mb->U[5][x] = q[1];
+ mb->U[6][x] = q[2];
+
+ p[0] = mb->V[3][x];
+ p[1] = mb->V[2][x];
+ p[2] = mb->V[1][x];
+ p[3] = mb->V[0][x];
+ q[0] = mb->V[4][x];
+ q[1] = mb->V[5][x];
+ q[2] = mb->V[6][x];
+ q[3] = mb->V[7][x];
+
+ ff_h264_deblocking_filter_line_chroma(p,q,QPCav,bS[2][x<<1]);
+
+ mb->V[3][x] = p[0];
+ mb->V[2][x] = p[1];
+ mb->V[1][x] = p[2];
+ mb->V[4][x] = q[0];
+ mb->V[5][x] = q[1];
+ mb->V[6][x] = q[2];
+ }
+}
+
+static void ff_h264_deblock(H264Context *t, FrameInfo *frame, int isIDR, int QPYav, int QPCav)
+{
+ int y,x;
+ int w,h;
+
+ w = t->mb_width;
+ h = t->mb_height;
+
+ // for the top row, only vertical filtering is done at the edges, for the top-left block, no filtering is
+ // done at the edge
+
+ ff_h264_deblock_macroblock(&(frame->reconstructed_mb_map[0][0]),0,0,isIDR,QPYav,QPCav);
+ for (x = 1 ; x < w ; x++)
+ ff_h264_deblock_macroblock(&(frame->reconstructed_mb_map[0][x]),1,0,isIDR,QPYav,QPCav);
+ for (y = 1 ; y < h ; y++)
+ {
+ ff_h264_deblock_macroblock(&(frame->reconstructed_mb_map[y][0]),0,1,isIDR,QPYav,QPCav);
+ for (x = 1 ; x < w ; x++)
+ ff_h264_deblock_macroblock(&(frame->reconstructed_mb_map[y][x]),1,1,isIDR,QPYav,QPCav);
+ }
+}
+
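+// Encode one macroblock as P_L0_16x16: predict the motion vector from the
+// neighbours, run the motion search against the most recent reconstructed frame,
+// transform and entropy-code the residual (or extend the skip run), and
+// reconstruct the block.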
+static void ff_h264_encode_Inter_16x16(H264Context *t, MacroBlock *targetmb, PutBitContext *b,
+ MacroBlock *destmb, FrameInfo **previous_frames,
+ int num_prev_frames, int mbx, int mby)
+{
+ int y,h,x,w;
+ int w2,h2;
+ int qPI;
+ int QPc;
+ int QPy = t->QP;
+ int16_t UD[2][2];
+ int16_t VD[2][2];
+ int mvx = 0;
+ int mvy = 0;
+ int pred_mvx = 0;
+ int pred_mvy = 0;
+ int pred_mvx2 = 0;
+ int pred_mvy2 = 0;
+
+ qPI = t->QP + t->chroma_qp_index_offset;
+ qPI = clip(qPI, 0, 51);
+ QPc = chroma_qp[qPI];
+
+ w = targetmb->Y_width;
+ h = targetmb->Y_height;
+ w2 = w>>1;
+ h2 = h>>1;
+
+ // Find motion vector and prediction
+
+ ff_h264_estimate_motion_vectors(destmb, &pred_mvx, &pred_mvy, &pred_mvx2, &pred_mvy2);
+ ff_h264_find_motion_vector_and_prediction(t, targetmb, previous_frames[0], mbx, mby, &mvx, &mvy,
+ pred_mvx, pred_mvy, destmb);
+
+ // Calculate residual
+
+ H264_COPY_16X16BLOCK(t->residual.part4x4Y,(int16_t)targetmb->Y,(int16_t)destmb->Y);
+ H264_COPY_8X8BLOCK(t->residual.part4x4U,(int16_t)targetmb->U,(int16_t)destmb->U);
+ H264_COPY_8X8BLOCK(t->residual.part4x4V,(int16_t)targetmb->V,(int16_t)destmb->V);
+
+ // Transform residual: DCT
+
+ for (y = 0 ; y < 4 ; y++)
+ {
+ for (x = 0 ; x < 4 ; x++)
+ {
+ t->dspcontext.h264_transform_dct_quant(t->residual.part4x4Y[y][x],QPy,0);
+ }
+ }
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ t->dspcontext.h264_transform_dct_quant(t->residual.part4x4U[y][x],QPc,1);
+ t->dspcontext.h264_transform_dct_quant(t->residual.part4x4V[y][x],QPc,1);
+ }
+ }
+ // For U
+ for (y = 0 ; y < 2 ; y++)
+ for (x = 0 ; x < 2 ; x++)
+ UD[y][x] = t->residual.part4x4U[y][x][0][0];
+ ff_h264_hadamard_mult_2x2(UD);
+ t->dspcontext.h264_hadamard_quant_2x2(UD, QPc);
+
+ // For V
+ for (y = 0 ; y < 2 ; y++)
+ for (x = 0 ; x < 2 ; x++)
+ VD[y][x] = t->residual.part4x4V[y][x][0][0];
+ ff_h264_hadamard_mult_2x2(VD);
+ t->dspcontext.h264_hadamard_quant_2x2(VD,QPc);
+
+ // Encode motion vectors, residual, ...
+
+ destmb->mv_x = mvx;
+ destmb->mv_y = mvy;
+
+ ff_h264_encode_inter16x16_residual(t, b, mvx-pred_mvx, mvy-pred_mvy, mvx-pred_mvx2, mvy-pred_mvy2,
+ &(t->residual), UD, VD, 0, destmb, (mbx == t->mb_width-1 && mby == t->mb_height-1));
+
+ // Inverse hadamard
+
+ // For U
+ ff_h264_hadamard_mult_2x2(UD);
+ ff_h264_hadamard_invquant_2x2(UD,QPc);
+ for (y = 0 ; y < 2 ; y++)
+ for (x = 0 ; x < 2 ; x++)
+ t->residual.part4x4U[y][x][0][0] = UD[y][x];
+ // For V
+ ff_h264_hadamard_mult_2x2(VD);
+ ff_h264_hadamard_invquant_2x2(VD,QPc);
+ for (y = 0 ; y < 2 ; y++)
+ for (x = 0 ; x < 2 ; x++)
+ t->residual.part4x4V[y][x][0][0] = VD[y][x];
+
+ // Inverse DCT and add
+
+ for (y = 0 ; y < 4 ; y++)
+ {
+ for (x = 0 ; x < 4 ; x++)
+ {
+ t->dspcontext.h264_transform_inverse_quant_dct_add(t->residual.part4x4Y[y][x],QPy,0,&(destmb->Y[y*4][x*4]),t->Y_stride);
+ }
+ }
+ for (y = 0 ; y < 2 ; y++)
+ {
+ for (x = 0 ; x < 2 ; x++)
+ {
+ t->dspcontext.h264_transform_inverse_quant_dct_add(t->residual.part4x4U[y][x],QPc,1,&(destmb->U[y*4][x*4]),t->U_stride);
+ t->dspcontext.h264_transform_inverse_quant_dct_add(t->residual.part4x4V[y][x],QPc,1,&(destmb->V[y*4][x*4]),t->V_stride);
+ }
+ }
+
+ destmb->available = 1;
+}
+
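+// Very simple rate control: compare the average bitrate over the block size
+// history window with the requested bitrate and step QP by one in the appropriate
+// direction.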
+static void ff_h264_control_bitrate(AVCodecContext *avctx, H264Context *t)
+{
+ if (t->blocksize_history_total_milliseconds)
+ {
+ int64_t bitrate = (t->blocksize_history_sum*1000)/t->blocksize_history_total_milliseconds;
+
+ if (avctx->bit_rate > bitrate) // increase quality
+ {
+ if (t->QP > 0)
+ t->QP--;
+ }
+ else // decrease quality
+ {
+ if (t->QP < 51)
+ t->QP++;
+ }
+ }
+}
+
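+// Top-level encode callback: writes SPS and PPS before every IDR frame, then one
+// slice per frame (Intra_16x16 macroblocks for IDR frames, P_L0_16x16 otherwise),
+// deblocks the reconstructed frame, rotates the reference frame buffer and updates
+// the rate control history.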
+static int ff_h264_encode(AVCodecContext *avctx, uint8_t *buf, int buf_size, void *data)
+{
+ H264Context *t = (H264Context *)avctx->priv_data;
+ PutBitContext b;
+ int mbx, mby;
+ uint8_t *dest;
+ int destlen, i;
+ FrameInfo *tmp;
+ int QPy, QPc, qPI, isIDR = 0;
+
+ if (t->frame_num % t->IDRcount == 0)
+ isIDR = 1;
+
+ destlen = t->bufsize;
+ dest = t->po_data0;
+
+ // Copy the input image. Macroblocks were already assigned in the initialization step
+ img_copy(&(t->input_frame_copy),(AVPicture *)data,PIX_FMT_YUV420P,t->frame_width,t->frame_height);
+
+ // reconstructed_frames[0] will be used to reconstruct the image
+ ff_h264_clear_nonzero_markers(t->reconstructed_frames[0]->reconstructed_mb_map,t->mb_width,t->mb_height);
+
+ if (isIDR)
+ {
+ // sequence parameter set rbsp
+
+ init_put_bits(&b,t->pi_data0,t->bufsize);
+
+ put_bits(&b,8,66); // profile_idc = 66 in Baseline
+ put_bits(&b,1,0); // constraint_set0_flag
+ put_bits(&b,1,0); // constraint_set1_flag
+ put_bits(&b,1,0); // constraint_set2_flag
+ put_bits(&b,1,0); // constraint_set3_flag
+ put_bits(&b,4,0); // reserved_zero_4bits
+ put_bits(&b,8,40); // level_idc, p. 262, 10*level number
+
+ set_ue_golomb(&b,0); // seq_parameter_set_id
+ set_ue_golomb(&b,2); // log2_max_frame_num_minus4
+ set_ue_golomb(&b,2); // pic_order_cnt_type
+ set_ue_golomb(&b,16); // num_ref_frames [0, 16] (make sure we can use enough)
+
+ put_bits(&b,1,0); // gaps_in_frame_num_value_allowed_flag
+
+ set_ue_golomb(&b,t->mb_width-1); // pic_width_in_mbs_minus1
+ set_ue_golomb(&b,t->mb_height-1); // pic_height_in_map_units_minus1
+
+ put_bits(&b, 1, 1); // frame_mbs_only_flag = 1 in Baseline
+ put_bits(&b, 1, 0); // direct_8x8_inference_flag
+ put_bits(&b, 1, t->frame_cropping_flag); // frame_cropping_flag
+
+ if (t->frame_cropping_flag)
+ {
+ set_ue_golomb(&b, t->frame_crop_left_offset);
+ set_ue_golomb(&b, t->frame_crop_right_offset);
+ set_ue_golomb(&b, t->frame_crop_top_offset);
+ set_ue_golomb(&b, t->frame_crop_bottom_offset);
+ }
+
+ put_bits(&b, 1, 0); // vui_parameters_present_flag
+ put_bits(&b, 1, 1); // rbsp_stop_one_bit
+
+ dest = ff_h264_write_nal_unit(1,NAL_SPS,dest,&destlen,&b);
+
+ // Baseline: nal_unit_type not in [2,4]
+
+ // picture parameter set
+
+ init_put_bits(&b,t->pi_data0,t->bufsize);
+
+ set_ue_golomb(&b,0); // pic_parameter_set_id
+ set_ue_golomb(&b,0); // seq_parameter_set_id
+ put_bits(&b,1,0); // entropy_coding_mode 0 = CAVLC
+ put_bits(&b,1,0); // pic_order_present_flag
+ set_ue_golomb(&b,0); // num_slice_groups_minus1 Only one slice group
+ // List0 is needed for enabling P-slices
+ set_ue_golomb(&b,0); // num_ref_idx_l0_active_minus1 Using at most the previous frame for prediction
+ set_ue_golomb(&b,0); // num_ref_idx_l1_active_minus1 Definitely not using list 1 in baseline
+ put_bits(&b,1,0); // weighted_pred_flag Is 0 in baseline
+ put_bits(&b,2,0); // weighted_bipred_idc Is 0 in baseline
+
+ set_se_golomb(&b,t->PPS_QP-26); // pic_init_qp_minus26
+ set_se_golomb(&b,0); // pic_init_qs_minus26
+
+ set_se_golomb(&b,t->chroma_qp_index_offset); // chroma_qp_index_offset
+
+ put_bits(&b,1,0); // deblocking_filter_control_present_flag
+
+ put_bits(&b,1,0); // constrained_intra_pred_flag
+ put_bits(&b,1,0); // redundant_pic_cnt_present
+
+ put_bits(&b,1,1); // rbsp_stop_one_bit
+
+ dest = ff_h264_write_nal_unit(1,NAL_PPS,dest,&destlen,&b);
+
+ }
+
+ // IDR slice or P slice
+
+ init_put_bits(&b,t->pi_data0,t->bufsize);
+
+ // Slice header
+ set_ue_golomb(&b, 0); // first_mb_in_slice
+
+ if (isIDR)
+ set_ue_golomb(&b, 7); // slice_type
+ else
+ set_ue_golomb(&b, 5); // slice_type
+ // 0: current slice is P-slice
+ // 2: current slice is I-slice
+ // 5: current and all other slices are P-slices (0 or 5)
+ // 7: current and all other slices are I-slices (2 or 7)
+
+ set_ue_golomb(&b, 0); // pic_parameter_set_id
+ put_bits(&b, 6, t->frame_num%t->IDRcount); // frame_num
+ //put_bits(&b, 4, 0); // frame_num
+ if (isIDR)
+ set_ue_golomb(&b, t->IDR_frame_num); // idr_pic_id
+ else
+ put_bits(&b, 1, 0); // num_ref_idx_active_override_flag
+
+ // dec_ref_pic_marking() ...
+ put_bits(&b, 1, 0); // no_output_of_prior_pics_flag
+ put_bits(&b, 1, 0); // long_term_reference_flag
+ // ... dec_ref_pic_marking()
+ set_se_golomb(&b, t->QP - t->PPS_QP); // slice_qp_delta
+
+ // Slice data
+
+ if (isIDR)
+ {
+ for(mby = 0; mby < t->mb_height ; mby++)
+ for(mbx = 0 ; mbx < t->mb_width ; mbx++)
+ ff_h264_encode_Intra_16x16(t,&(t->mb_map[mby][mbx]),&b,&(t->reconstructed_frames[0]->reconstructed_mb_map[mby][mbx]));
+ }
+ else // Inter encoded frame
+ {
+ t->mb_skip_run = 0;
+
+ for(mby = 0; mby < t->mb_height ; mby++)
+ for(mbx = 0 ; mbx < t->mb_width ; mbx++)
+ ff_h264_encode_Inter_16x16(t,&(t->mb_map[mby][mbx]),&b,&(t->reconstructed_frames[0]->reconstructed_mb_map[mby][mbx]),&(t->reconstructed_frames[1]),t->framebufsize-1,mbx,mby);
+ }
+
+ QPy = t->QP;
+
+ qPI = t->QP + t->chroma_qp_index_offset;
+ qPI = clip(qPI, 0, 51);
+ QPc = chroma_qp[qPI];
+
+ ff_h264_deblock(t,t->reconstructed_frames[0],isIDR,QPy,QPc);
+
+ // Trailing bits
+
+ put_bits(&b,1,1); // rbsp_stop_one_bit
+
+ if (isIDR)
+ dest = ff_h264_write_nal_unit(1,NAL_IDR_SLICE,dest,&destlen,&b);
+ else
+ dest = ff_h264_write_nal_unit(1,NAL_SLICE,dest,&destlen,&b);
+
+ // cycle frame buffer
+
+ tmp = t->reconstructed_frames[t->framebufsize-1];
+ for (i = t->framebufsize-1 ; i > 0 ; i--)
+ t->reconstructed_frames[i] = t->reconstructed_frames[i-1];
+ t->reconstructed_frames[0] = tmp;
+
+ // copy the encoded bytes
+ memcpy(buf,t->po_data0,t->bufsize-destlen);
+
+ // update history information
+ t->blocksize_history_sum -= t->blocksize_history[t->blocksize_history_pos];
+ t->blocksize_history_sum += (t->bufsize-destlen)*8;
+ t->blocksize_history[t->blocksize_history_pos] = (t->bufsize-destlen)*8;
+
+ t->blocksize_history_pos++;
+ if (t->blocksize_history_pos == t->blocksize_history_length)
+ t->blocksize_history_pos = 0;
+ if (t->blocksize_history_num_filled < t->blocksize_history_length)
+ {
+ t->blocksize_history_num_filled++;
+ t->blocksize_history_total_milliseconds += t->milliseconds_per_frame;
+ }
+
+ if (!t->use_fixed_qp)
+ ff_h264_control_bitrate(avctx,t);
+
+ // adjust frame numbers
+ t->frame_num++;
+ if (isIDR)
+ t->IDR_frame_num++;
+ return (t->bufsize-destlen);
+}
+
+static int ff_h264_encoder_close(AVCodecContext *avctx)
+{
+ PutBitContext b;
+ H264Context *t = (H264Context *)avctx->priv_data;
+ uint8_t *dest;
+ int destlen;
+ int y,i;
+
+ destlen = t->bufsize;
+ dest = t->po_data0;
+
+ init_put_bits(&b,t->pi_data0,t->bufsize);
+
+ // write end of stream
+
+ dest = ff_h264_write_nal_unit(0,NAL_END_STREAM,dest,&destlen,&b);
+
+ *dest = 0;
+ dest++;
+ destlen--;
+
+ // clean up
+
+ avpicture_free(&t->pi);
+ avpicture_free(&t->po);
+
+ for (y = 0 ; y < t->mb_height ; y++)
+ av_free(t->mb_map[y]);
+
+ av_free(t->mb_map);
+
+ for (i = 0 ; i < t->framebufsize ; i++)
+ {
+ av_free(t->reconstructed_frames[i]->reconstructed_picture.data[0]);
+
+ for (y = 0 ; y < t->mb_height ; y++)
+ av_free(t->reconstructed_frames[i]->reconstructed_mb_map[y]);
+
+ av_free(t->reconstructed_frames[i]->reconstructed_mb_map);
+ av_free(t->reconstructed_frames[i]);
+ }
+
+ av_free(t->reconstructed_frames);
+
+ av_free(t->input_frame_copy.data[0]);
+
+ av_free(t->blocksize_history);
+
+ return 0;
+}
+
+#ifdef CONFIG_ENCODERS
+AVCodec h264_encoder = {
+ "ffh264",
+ CODEC_TYPE_VIDEO,
+ CODEC_ID_FFH264,
+ sizeof(H264Context),
+ ff_h264_encoder_init,
+ ff_h264_encode,
+ ff_h264_encoder_close,
+ .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
+};
+#endif
+
diff --git a/libavcodec/h264enc.h b/libavcodec/h264enc.h
new file mode 100644
index 0000000..c962618
--- /dev/null
+++ b/libavcodec/h264enc.h
@@ -0,0 +1,105 @@
+/*
+ * H.264 encoder
+ * Copyright (c) 2006 Expertisecentrum Digitale Media, UHasselt
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264encdata.h"
+
+/**
+ * Describes one 16x16 macroblock. The Y, U and V arrays hold per-row pointers
+ * into the corresponding picture planes; the *_nonzero arrays cache the number
+ * of nonzero coefficients of each 4x4 transform block for CAVLC and deblocking.
+ */
+typedef struct MacroBlock
+{
+ uint8_t *Y[16];
+ uint8_t *U[8];
+ uint8_t *V[8];
+ int Y_width;
+ int Y_height;
+ int Y_nonzero[4][4]; ///< y,x
+ int U_nonzero[2][2];
+ int V_nonzero[2][2];
+ struct MacroBlock *leftblock,*topblock,*rightblock;
+ int available;
+ int mv_x;
+ int mv_y;
+} MacroBlock;
+
+typedef struct Residual
+{
+ int16_t part4x4Y[4][4][4][4]; ///< ypos and xpos of 4x4 part, followed by y,x of pixel
+ int16_t part4x4U[2][2][4][4]; ///< ypos and xpos of 4x4 part, followed by y,x of pixel
+ int16_t part4x4V[2][2][4][4]; ///< ypos and xpos of 4x4 part, followed by y,x of pixel
+} Residual;
+
+typedef struct FrameInfo
+{
+ AVPicture reconstructed_picture;
+ MacroBlock **reconstructed_mb_map; ///< macroblock map of reconstructed picture
+} FrameInfo;
+
+typedef struct H264Context
+{
+ uint8_t *pi_data0;
+ uint8_t *po_data0;
+ int bufsize;
+ int frame_num;
+ int IDR_frame_num;
+ MacroBlock **mb_map; ///< macroblock map for input picture
+ FrameInfo **reconstructed_frames;
+ int framebufsize; ///< length of previous array
+ int mb_width; ///< width in macroblocks
+ int mb_height; ///< height in macroblocks
+ int QP;
+ int PPS_QP; ///< the QP value stored in the picture parameter set
+ int chroma_qp_index_offset;
+ int IDRcount;
+ int frame_cropping_flag;
+ int frame_crop_left_offset;
+ int frame_crop_right_offset;
+ int frame_crop_top_offset;
+ int frame_crop_bottom_offset;
+ Residual residual;
+
+ MpegEncContext s;
+ AVPicture pi, po;
+
+ DSPContext dspcontext;
+ int Y_stride;
+ int U_stride;
+ int V_stride;
+
+ int frame_width;
+ int frame_height;
+ int refframe_width;
+ int refframe_height;
+
+ AVPicture input_frame_copy; ///< buffer to hold copy of input frame
+ int mb_skip_run;
+
+ int64_t *blocksize_history;
+ int blocksize_history_length;
+ int blocksize_history_total_milliseconds;
+ int milliseconds_per_frame;
+ int blocksize_history_pos;
+ int blocksize_history_num_filled;
+ int64_t blocksize_history_sum;
+
+ int use_fixed_qp;
+
+} H264Context;
+
+
diff --git a/libavcodec/h264encdata.h b/libavcodec/h264encdata.h
new file mode 100644
index 0000000..93bb647
--- /dev/null
+++ b/libavcodec/h264encdata.h
@@ -0,0 +1,110 @@
+/*
+ * H.264 encoder
+ * Copyright (c) 2006 Expertisecentrum Digitale Media, UHasselt
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define MVTABLE_OFFSET (128*4)
+
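+// mv_len_table[v + MVTABLE_OFFSET] is the length in bits of the signed Exp-Golomb
+// code for a motion vector difference of v quarter-pel units; the motion search
+// uses it to estimate the motion vector cost.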
+static const char mv_len_table[MVTABLE_OFFSET*2+1] =
+{
+ 21,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,15,15,15,15,15,
+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,13,
+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+ 13,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
+ 11,11,9,9,9,9,9,9,9,9,7,7,7,7,5,
+ 5,3,1,3,5,5,7,7,7,7,9,9,9,9,9,
+ 9,9,9,11,11,11,11,11,11,11,11,11,11,11,11,
+ 11,11,11,11,13,13,13,13,13,13,13,13,13,13,13,
+ 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+ 13,13,13,13,13,13,15,15,15,15,15,15,15,15,15,
+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+ 15,15,15,15,15,15,15,15,15,15,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+ 17,17,17,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,
+ 19,19,19,19,21
+};
+
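+// Empirical estimate of the number of bits needed to code the residual of a 16x16
+// block, indexed by QP/4 and by SAD/256; the motion search interpolates in this
+// table to weigh residual cost against motion vector cost.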
+static const int sae_codeblocksize_relation[13][8] = {
+{55,1590,1796,1865,1925,1977,2010,2021},
+{27,1328,1525,1593,1649,1699,1729,1741},
+{20,1097,1288,1353,1405,1449,1479,1485},
+{17,878,1063,1130,1177,1217,1244,1250},
+{16,667,875,945,992,1027,1051,1055},
+{18,491,704,790,835,869,891,895},
+{0,352,538,658,718,747,769,774},
+{0,243,398,500,561,643,672,683},
+{0,163,278,363,446,487,518,568},
+{0,98,197,259,313,368,425,453},
+{0,53,133,186,224,259,293,326},
+{0,22,81,126,162,188,210,231},
+{0,14,47,79,106,135,156,173}
+};
+
diff --git a/tests/Makefile b/tests/Makefile
index 835fab6..892484a 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -20,7 +20,7 @@ test-server: vsynth1/0.pgm asynth1.sw
@$(VPATH)/server-regression.sh $(SERVER_REFFILE) $(VPATH)/test.conf
# fast regression tests for all codecs
-codectest mpeg4 mpeg ac3 snow snowll: vsynth1/0.pgm vsynth2/0.pgm asynth1.sw tiny_psnr$(EXESUF)
+codectest mpeg4 mpeg ac3 snow snowll ffh264: vsynth1/0.pgm vsynth2/0.pgm asynth1.sw tiny_psnr$(EXESUF)
@$(VPATH)/regression.sh $@ $(REFFILE1) vsynth1
@$(VPATH)/regression.sh $@ $(REFFILE2) vsynth2
diff --git a/tests/ffmpeg.regression.ref b/tests/ffmpeg.regression.ref
index 1aec954..3cf62d7 100644
--- a/tests/ffmpeg.regression.ref
+++ b/tests/ffmpeg.regression.ref
@@ -59,6 +59,10 @@ stddev: 8.08 PSNR:29.97 bytes:7602176
2415378 ./data/a-h263p.avi
28fd12ac0b168252d81df6f6e60a5d17 *./data/out.yuv
stddev: 2.07 PSNR:41.76 bytes:7602176
+5b4fae455aa041e99d3467f2a67dd76b *./data/a-ffh264.mp4
+3014465 ./data/a-ffh264.mp4
+ba88c2a8e9bba81581dea8c1bbc03ad5 *./data/out.yuv
+stddev: 0.68 PSNR:51.39 bytes:7602176
d84b65558cd386064ab7a126d66c4744 *./data/a-odivx.mp4
554499 ./data/a-odivx.mp4
57aed19df5cbada4b05991527ee72ebe *./data/out.yuv
diff --git a/tests/regression.sh b/tests/regression.sh
index 9ded777..1e1663d 100755
--- a/tests/regression.sh
+++ b/tests/regression.sh
@@ -40,6 +40,8 @@ elif [ "$1" = "snow" ] ; then
do_snow=y
elif [ "$1" = "snowll" ] ; then
do_snowll=y
+elif [ "$1" = "ffh264" ] ; then
+ do_ffh264=y
elif [ "$1" = "libavtest" ] ; then
do_libav=y
logfile="$datadir/libav.regression"
@@ -82,6 +84,7 @@ else
do_svq1=y
do_snow=y
do_snowll=y
+ do_ffh264=y
do_adpcm_yam=y
do_dv=y
do_dv50=y
@@ -297,6 +300,16 @@ do_ffmpeg $raw_dst -y -i $file -f rawvid
fi
###################################
+if [ -n "$do_ffh264" ] ; then
+# h264 encoding
+file=${outfile}ffh264.mp4
+do_ffmpeg $file -y -qscale 10 -f pgmyuv -i $raw_src -an -vcodec ffh264 -vtag avc1 $file
+
+# h264 decoding
+do_ffmpeg $raw_dst -y -i $file -f rawvideo $raw_dst
+fi
+
+###################################
if [ -n "$do_mpeg4" ] ; then
# mpeg4
file=${outfile}odivx.mp4
diff --git a/tests/rotozoom.regression.ref b/tests/rotozoom.regression.ref
index 5652f2d..1a80264 100644
--- a/tests/rotozoom.regression.ref
+++ b/tests/rotozoom.regression.ref
@@ -59,6 +59,10 @@ stddev: 5.44 PSNR:33.41 bytes:7602176
869200 ./data/a-h263p.avi
80fb224bebbe2e04f228da7485b905c5 *./data/out.yuv
stddev: 1.91 PSNR:42.49 bytes:7602176
+f42c060a951e4711a1bcf91b06936021 *./data/a-ffh264.mp4
+2219112 ./data/a-ffh264.mp4
+c390adcb2747e35c4aaef623d1e17837 *./data/out.yuv
+stddev: 0.71 PSNR:51.06 bytes:7602176
286c5a5fca0d3e17ba6ede970b8318b8 *./data/a-odivx.mp4
120150 ./data/a-odivx.mp4
e8c90899c32e11e7e4d1354dab0b8f28 *./data/out.yuv