[FFmpeg-devel] Awakening of FFH264

Mon Mar 22 19:33:32 CET 2010

> +inline void ff_h264_deblock_line_luma_c(int p[4], int q[4], int QP, int bS)

Pure code duplication.

> +void ff_h264_deblock_macroblock_c(MacroBlock *mb, int filter_left_edge, int
> filter_top_edge, int isIDR, int QPYav, int QPCav)

This is about 5 times longer than it needs to be.

> +
> +void ff_h264_inv_quant_dct_add_c(DCTELEM block[4][4], int QP, int
> dontscaleDC, uint8_t *dst, int stride) // y,x indexing

I don't see the reason for this unrolling.  This is premature
optimization at its worst; write an asm function if you want it
faster.  This is just code bloat.

> +    /* Not implemented yet */
> +    //if (ARCH_ARM) ff_h264dspenc_init_arm(c);
> +    //if (ARCH_PPC) ff_h264dspenc_init_ppc(c);
> +    //if (HAVE_MMX) ff_h264dspenc_init_x86(c);

I don't see why this is here.
> +    void (*h264_hadamard_mult_2x2)(DCTELEM Y[2][2]);

2x2 is faster if inlined.  It probably shouldn't be a DSP function.

> +/* Deblocking filter (p153) */
> +static const uint8_t alpha_table[52*3] = {
> +static const uint8_t beta_table[52*3] = {
> +static const uint8_t tc0_table[52*3][4] = {

Pure duplication.  These tables are already in libavcodec.

> +#define DEFAULT_QP                  30

Use the ffmpeg parameter system, don't define your own internal
reimplementations of it.

> +#define NUMBER_OF_FRAMES             2

Why not make this map to the reference frames parameter?

> +#define RATECONTROLINTERVAL        0.5

Why not use the existing mpegvideo ratecontrol instead of
reimplementing your own badly?

> +#define CHROMA_QP_INDEX_OFFSET_MAX  12
> +#define CHROMA_QP_INDEX_OFFSET_MIN -12

These values are not likely to ever change in the future, so a define
seems needless.

> +#define RTP_H264_FUA 28
> +#define H264_NAL_NRI_MASK 0x60
> +#define H264_NAL_TYPE_MASK 0x1f

These look like they should be defined closer to their use.

> +#define H264_COPY_4X4BLOCK_TRANSPOSED_PART(A, xoffset, yoffset, dest, src1,
> src2) \
> +dest[0][A] = src1[yoffset+A][xoffset+0]-src2[yoffset+A][xoffset+0]; \
> +dest[1][A] = src1[yoffset+A][xoffset+1]-src2[yoffset+A][xoffset+1]; \
> +dest[2][A] = src1[yoffset+A][xoffset+2]-src2[yoffset+A][xoffset+2]; \
> +dest[3][A] = src1[yoffset+A][xoffset+3]-src2[yoffset+A][xoffset+3];
> +
> +#define H264_COPY_4X4BLOCK_TRANSPOSED(xoffset,yoffset,dest,src1,src2) \
> +{ \
> +H264_COPY_4X4BLOCK_TRANSPOSED_PART(0, xoffset, yoffset, dest, src1, src2);
> \
> +H264_COPY_4X4BLOCK_TRANSPOSED_PART(1, xoffset, yoffset, dest, src1, src2);
> \
> +H264_COPY_4X4BLOCK_TRANSPOSED_PART(2, xoffset, yoffset, dest, src1, src2);
> \
> +H264_COPY_4X4BLOCK_TRANSPOSED_PART(3, xoffset, yoffset, dest, src1, src2);
> \
> +}
> +
> +#define H264_COPY_16X16BLOCK(dest,src1,src2) \
> +{ \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 0, 0,dest[0][0],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 4, 0,dest[0][1],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 8, 0,dest[0][2],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED(12, 0,dest[0][3],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 0, 4,dest[1][0],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 4, 4,dest[1][1],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 8, 4,dest[1][2],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED(12, 4,dest[1][3],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 0, 8,dest[2][0],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 4, 8,dest[2][1],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 8, 8,dest[2][2],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED(12, 8,dest[2][3],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 0,12,dest[3][0],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 4,12,dest[3][1],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED( 8,12,dest[3][2],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED(12,12,dest[3][3],src1,src2); \
> +}
> +
> +#define H264_COPY_8X8BLOCK(dest,src1,src2) \
> +{ \
> +H264_COPY_4X4BLOCK_TRANSPOSED(0,0,dest[0][0],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED(4,0,dest[0][1],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED(0,4,dest[1][0],src1,src2); \
> +H264_COPY_4X4BLOCK_TRANSPOSED(4,4,dest[1][1],src1,src2); \
> +}

This is really really ugly.  I don't like these stupid nested macros,
and not only are they stupid, they're slow (no write-combining).

Use DSP functions for the larger ones.

> +#define SAR_NOVAL -1
> +
> +static const struct{
> +    int width;
> +    int height;
> +    int sar;
> +} sar_table[] = {
> +    {  1,  1,  1},
> +    { 12, 11,  2},
> +    { 10, 11,  3},
> +    { 16, 11,  4},
> +    { 40, 33,  5},
> +    { 24, 11,  6},
> +    { 20, 11,  7},
> +    { 32, 11,  8},
> +    { 80, 33,  9},
> +    { 18, 11, 10},
> +    { 15, 11, 11},
> +    { 64, 33, 12},
> +    {160, 99, 13},
> +    {  0,  0, SAR_NOVAL}
> +};

These values can be uint8_t.  Define things closer to where they are used.

> +static uint8_t *rtp_buffer;

Static buffers are not allowed.

> +static void h264_rtp_packetize(AVCodecContext *avctx, uint8_t *src, int
> srcsize)
> +{
> +    H264EncContext *t = (H264EncContext *)avctx->priv_data;
> +
> +    if (srcsize < avctx->rtp_payload_size)
> +    {
> +        // NAL
> +        uint8_t *nalu_payload = src;
> +        int      tmpsize = srcsize;
> +
> +        /* skip any data before the start code prefix */
> +        while (tmpsize > 4 && (nalu_payload[0] != 0x00 || nalu_payload[1]
> != 0x00 || nalu_payload[2] != 0x01))
> +        {
> +            nalu_payload++;
> +            tmpsize--;
> +            av_log(avctx, AV_LOG_ERROR, "Skipping byte before
> startcode...\n");
> +        }
> +
> +        if (tmpsize < 4)
> +            return;
> +
> +        /* skip the start code prefix */
> +        nalu_payload += 3;
> +        tmpsize -= 3;
> +
> +        emms_c();
> +        avctx->rtp_callback(avctx, nalu_payload, tmpsize, t->mb_width *
> t->mb_height);
> +    }
> +    else
> +    {
> +        // FU-A
> +        uint8_t *nalu_payload = src;
> +        uint8_t nalu_header;
> +        int      tmpsize = srcsize;
> +        int     start=1, end=0, first=0;
> +
> +        /* skip any data before the start code prefix */
> +        while (tmpsize > 4 && (nalu_payload[0] != 0x00 || nalu_payload[1]
> != 0x00 || nalu_payload[2] != 0x01))
> +        {
> +            nalu_payload++;
> +            tmpsize--;
> +            av_log(avctx, AV_LOG_ERROR, "Skipping byte before startcode...
> (FU-A)\n");
> +        }
> +
> +        if (tmpsize < 4)
> +            return;
> +
> +        /* skip the start code prefix */
> +        nalu_payload += 3;
> +        tmpsize -= 3;
> +
> +        nalu_header = *nalu_payload;
> +
> +        nalu_payload++; /* skip the NALU header */
> +        tmpsize--;
> +
> +        while (!end)
> +        {
> +            int payload_size;
> +
> +            payload_size = FFMIN (avctx->rtp_payload_size - 2, tmpsize);
> +            if (tmpsize==payload_size)
> +                end = 1;
> +
> +            memset (rtp_buffer, 0, avctx->rtp_payload_size);
> +
> +            rtp_buffer[0] = (nalu_header & H264_NAL_NRI_MASK) |
> RTP_H264_FUA;  // FU indicator
> +            rtp_buffer[1] = (start<<7)|(end<<6)|(nalu_header &
> H264_NAL_TYPE_MASK); // FU header
> +
> +            memcpy (rtp_buffer + 2, nalu_payload + first, payload_size);
> +
> +            emms_c();
> +            avctx->rtp_callback(avctx, rtp_buffer, payload_size + 2, end ?
> t->mb_width * t->mb_height : 0);
> +
> +            tmpsize -= payload_size;
> +            first += payload_size;
> +            start=0;
> +        }
> +    }
> +}

Why is this even in libavcodec?

>  /**
>  * Write out the provided data into a NAL unit.
>  * @param nal_ref_idc NAL reference IDC
> @@ -33,36 +223,36 @@
>  * @param b2 the data which should be escaped
>  * @returns pointer to current position in the output buffer or NULL if an
> error occurred
>  */
> -static uint8_t *h264_write_nal_unit(int nal_ref_idc, int nal_unit_type,
> uint8_t *dest, int *destsize,
> -                          PutBitContext *b2)
> +static uint8_t *h264_write_nal_unit(AVCodecContext *avctx, int nal_ref_idc,
> int nal_unit_type, uint8_t *dest, int *destsize,
> +                                    PutBitContext *b2)
>  {
>     PutBitContext b;
>     int i, destpos, rbsplen, escape_count;
>     uint8_t *rbsp;
> -
> +
>     if (nal_unit_type != NAL_END_STREAM)
>         put_bits(b2,1,1); // rbsp_stop_bit
> -
> +
>     // Align b2 on a byte boundary
>     align_put_bits(b2);
>     rbsplen = put_bits_count(b2)/8;
>     flush_put_bits(b2);
>     rbsp = b2->buf;
> -
> +
>     init_put_bits(&b,dest,*destsize);
> -
> +
>     put_bits(&b,16,0);
>     put_bits(&b,16,0x01);
> -
> +
>     put_bits(&b,1,0); // forbidden zero bit
> -    put_bits(&b,2,nal_ref_idc); // nal_ref_idc
> -    put_bits(&b,5,nal_unit_type); // nal_unit_type
> -
> +    put_bits(&b,2,nal_ref_idc);
> +    put_bits(&b,5,nal_unit_type);
> +
>     flush_put_bits(&b);
> -
> +
>     destpos = 5;
>     escape_count= 0;
> -
> +

What's with all these false deltas?

> -static const uint8_t pict_type_to_golomb[7] = {-1, 2, 0, 1, -1, 4, 3};

Why do we have negative values in a uint8_t array?

> +static void h264_assign_macroblocks(AVPicture *p, MacroBlock **mb_map, int
> mb_width, int mb_height, int setneighbours)
> +{
> +    int y, x, i;
> +    int Ylinesize = p->linesize[0];
> +    int Ulinesize = p->linesize[1];
> +    int Vlinesize = p->linesize[2];
> +
> +    if (!setneighbours)
> +    {
> +        for (y = 0 ; y < mb_height ; y++)
> +        {
> +            int y16 = y << 4;
> +            int y8 = y << 3;
> +
> +            for (x = 0 ; x < mb_width ; x++)
> +            {
> +                int x16 = x << 4;
> +                int x8 = x << 3;
> +
> +                for (i = 0 ; i < 8 ; i++)
> +                {
> +                    int ypos = y8 + i;
> +                    mb_map[y][x].U[i] = p->data[1] + (x8+ypos*Ulinesize);
> +                    mb_map[y][x].V[i] = p->data[2] + (x8+ypos*Vlinesize);
> +                }
> +                for (i = 0 ; i < 16 ; i++)
> +                    mb_map[y][x].Y[i] = p->data[0] +
> (x16+(y16+i)*Ylinesize);
> +
> +                mb_map[y][x].topblock = NULL;
> +                mb_map[y][x].leftblock = NULL;
> +                mb_map[y][x].rightblock = NULL;
> +                mb_map[y][x].available = 0;
> +            }
> +        }
> +    }
> +    else
> +    {
> +        y = 0;
> +        x = 0;
> +        for (i = 0 ; i < 8 ; i++)
> +        {
> +            mb_map[y][x].U[i] = p->data[1] + ((x<<3) + ((y<<3) + i) *
> Ulinesize);
> +            mb_map[y][x].V[i] = p->data[2] + ((x<<3) + ((y<<3) + i) *
> Vlinesize);
> +        }
> +        for (i = 0 ; i < 16 ; i++)
> +            mb_map[y][x].Y[i] = p->data[0] + ((x<<4) + ((y<<4) + i) *
> Ylinesize);
> +
> +        mb_map[y][x].topblock = NULL;
> +        mb_map[y][x].leftblock = NULL;
> +
> +        if (x < mb_width-1)
> +            mb_map[y][x].rightblock = &(mb_map[y][x+1]);
> +        else
> +            mb_map[y][x].rightblock = NULL;
> +        mb_map[y][x].available = 0;
> +
> +        y = 0;
> +        for (x = 1 ; x < mb_width ; x++)
> +        {
> +            for (i = 0 ; i < 8 ; i++)
> +            {
> +                mb_map[y][x].U[i] = p->data[1] + ((x<<3) + ((y<<3) + i) *
> Ulinesize);
> +                mb_map[y][x].V[i] = p->data[2] + ((x<<3) + ((y<<3) + i) *
> Vlinesize);
> +            }
> +            for (i = 0 ; i < 16 ; i++)
> +                mb_map[y][x].Y[i] = p->data[0] + ((x<<4) + ((y<<4) + i) *
> Ylinesize);
> +
> +            mb_map[y][x].topblock = NULL;
> +            mb_map[y][x].leftblock = &(mb_map[y][x-1]);
> +            if (x < mb_width - 1)
> +                mb_map[y][x].rightblock = &(mb_map[y][x+1]);
> +            else
> +                mb_map[y][x].rightblock = NULL;
> +            mb_map[y][x].available = 0;
> +        }
> +
> +        x = 0;
> +        for (y = 1 ; y < mb_height ; y++)
> +        {
> +            for (i = 0 ; i < 8 ; i++)
> +            {
> +                mb_map[y][x].U[i] = p->data[1] + ((x<<3) + ((y<<3) + i) *
> Ulinesize);
> +                mb_map[y][x].V[i] = p->data[2] + ((x<<3) + ((y<<3) + i) *
> Vlinesize);
> +            }
> +            for (i = 0 ; i < 16 ; i++)
> +                mb_map[y][x].Y[i] = p->data[0] + ((x<<4) + ((y<<4) + i) *
> Ylinesize);
> +
> +            mb_map[y][x].topblock = &(mb_map[y-1][x]);
> +            mb_map[y][x].leftblock = NULL;
> +            if (x < mb_width - 1)
> +                mb_map[y][x].rightblock = &(mb_map[y][x+1]);
> +            else
> +                mb_map[y][x].rightblock = NULL;
> +            mb_map[y][x].available = 0;
> +        }
> +
> +        for (y = 1 ; y < mb_height ; y++)
> +        {
> +            for (x = 1 ; x < mb_width ; x++)
> +            {
> +                for (i = 0 ; i < 8 ; i++)
> +                {
> +                    mb_map[y][x].U[i] = p->data[1] + ((x<<3) + ((y<<3) + i)
> * Ulinesize);
> +                    mb_map[y][x].V[i] = p->data[2] + ((x<<3) + ((y<<3) + i)
> * Vlinesize);
> +                }
> +                for (i = 0 ; i < 16 ; i++)
> +                    mb_map[y][x].Y[i] = p->data[0] + ((x<<4) + ((y<<4) + i)
> * Ylinesize);
> +
> +                mb_map[y][x].topblock = &(mb_map[y-1][x]);
> +                mb_map[y][x].leftblock = &(mb_map[y][x-1]);
> +                if (x < mb_width - 1)
> +                    mb_map[y][x].rightblock = &(mb_map[y][x+1]);
> +                else
> +                    mb_map[y][x].rightblock = NULL;
> +                mb_map[y][x].available = 0;
> +            }
> +        }
> +    }
> +}

This seems entirely unnecessary.  It isn't as if we will ever support
useless features like FMO, so I don't see why we need to waste memory
on a fancy map of macroblock positions.  This isn't Theora.

> +static void h264_clear_nonzero_markers(MacroBlock **mb_map, int mb_width,
> int mb_height)

This function should not exist.  There is no reason that it wouldl be
needed in any H.264 encoder.

> +    put_bits(&b, 1, 0); // constraint_set1_flag
This should be set to 1.

> +    set_ue_golomb(&b, 2); // pic_order_cnt
Why in the world are you using poc type 2?

> +    set_ue_golomb(&b, 1); // num_ref_frames [0, 16]

Why is this hardcoded when the number of frames is a define?

> +    for (i = 0; sar_table[i].sar != -1; i++)
> +    {
> +        if (sar_table[i].width == t->vui.sar_width &&
> +                sar_table[i].height == t->vui.sar_height )
> +            break;
There are two spacing inconsistencies here.  Also, your syntax doesn't
match the standard K&R used by ffmpeg.

> +    put_bits(&b, 1, 0); // bitstream_restriction_flag

You should absolutely write this flag and the relevant information,
especially num_reorder_frames.  We don't need more encoders that don't
do this.

> +    set_ue_golomb(&b, 0); // num_ref_idx_l0_active_minus1   Using at most
> the previous frame for prediction

Same as what I noted above (it's a define...).

> +    /* @FIXME: Is this the correct way to determine the DPB? Or should the
> +     * reference frames be taken into account? */
> +    dpb = t->refframe_width * t->refframe_height * 1.5; /* YUV420 */

Yes, yes, they should.

> +        av_log(avctx, AV_LOG_ERROR, "No maximum bitrate specified,
> bitstream may exceed maximum bitrate level constraints\n");
> +        max_br = avctx->bit_rate;

Why is this an error?  And how are you going to enforce max bitrates
given that the level table doesn't have CPB sizes in it?

> +    switch(avctx->pix_fmt)
> +    {
> +        case PIX_FMT_YUV420P:
> +            break;
> +        default:
> +            av_log(avctx, AV_LOG_ERROR, "format not supported\n");
> +            return -1;
> +    }

Why do we need a switch statement for this?

This is needlessly large.

> +    /* If the width is not a multiple of 16, enabling cropping */
> +    if (( width % 16) !=0 )
> +    {
> +        t->frame_cropping_flag = 1;
> +        t->frame_crop_left_offset = 0;
> +        t->frame_crop_right_offset = (width%16)/2;
> +        t->mb_width++;
> +    }
> +
> +    /* If the height is not a multiple of 16, enabling cropping */
> +    if (( height % 16) !=0 )
> +    {
> +        t->frame_cropping_flag = 1;
> +        t->frame_crop_top_offset = 0;
> +        t->frame_crop_bottom_offset = (height%16)/2;
> +        t->mb_height++;
> +    }

This is incorrect.  It should be obvious why.

> +    t->mb_map = av_malloc(sizeof(MacroBlock*) * t->mb_height);
> +    for (y = 0 ; y < t->mb_height ; y++)
> +    {
> +        t->mb_map[y] = av_malloc(sizeof(MacroBlock) * t->mb_width);
> +        for (x = 0 ; x < t->mb_width ; x++)
> +        {
> +            t->mb_map[y][x].Y_width = 16;
> +            t->mb_map[y][x].Y_height = 16;
> +        }
> +    }

Nevermind the fact that the MB map is unnecessary; why is it an array
of pointers instead of just a flat array?

> +
>  t->reconstructed_frames[i]->reconstructed_mb_map[y][x].Y_width = 16;
> +
>  t->reconstructed_frames[i]->reconstructed_mb_map[y][x].Y_height = 16;

This is retarded.  We store the (constant) size of each macroblock
thousands of times per frame because...?

> +            av_log(avctx, AV_LOG_ERROR, "Level %d not sufficient for this
> encoding.\n", avctx->level);

Errors should be fatal.  If it's not fatal, it's not an error.

> t->reconstructed_frames[0]->reconstructed_picture.linesize[0];
> +    t->U_stride =
> t->reconstructed_frames[0]->reconstructed_picture.linesize[1];
> +    t->V_stride =
> t->reconstructed_frames[0]->reconstructed_picture.linesize[2];

You should really use shorter names for these data structures.

> +#if 0   // Only used for dev & debug

Why?  In CAVLC it's easy to seek back and rewrite a block if it went
over the max MB size (and write PCM instead).

> +static inline void h264_neighbour_count_nonzero(MacroBlock *mb, int type,
> int x, int y, int *nA, int *nB)
> +{
> +    if (type == NEIGHBOUR_SUBTYPE_Y)
> +        H264_NEIGHBOUR_COUNT_NONZERO_PLANE(Y_nonzero, 3)
> +        else if (type == NEIGHBOUR_SUBTYPE_U)
> +            H264_NEIGHBOUR_COUNT_NONZERO_PLANE(U_nonzero, 1)
> +            else
> +                H264_NEIGHBOUR_COUNT_NONZERO_PLANE(V_nonzero, 1)
> +}

This should be done in a fill_caches-like function.

> +static const int8_t zigzagx[16] = { 0,1,0,0,1,2,3,2,1,0,1,2,3,3,2,3 };
> +static const int8_t zigzagy[16] = { 0,0,1,2,1,0,0,1,2,3,3,2,1,2,3,3 };
> +
> +#define H264_ENCODE_INTRA16X16_RESIDUAL_COEFFICIENTS(PLANE) \
> +coefficients[0] = PLANE[0][0]; \
> +coefficients[1] = PLANE[0][1]; \
> +coefficients[2] = PLANE[1][0]; \
> +coefficients[3] = PLANE[1][1]; \
> +h264cavlc_encode(b, coefficients, 4, -1, -1, 1); // nA and nB are not used
> in this case

Instead of making everything a define why not--god forbid--make a function?

> +static void h264_encode_intra16x16_residual(PutBitContext *b, DCTELEM
> YD[4][4], DCTELEM UD[2][2], DCTELEM VD[2][2],
> +                                            Residual *residual, int
> lumamode, int chromamode, MacroBlock *mb)
> +{
> +    int lumaACcount = 0;
> +    int chromaDCcount = 0;
> +    int chromaACcount = 0;
> +    int CodedBlockPatternChroma = 0;
> +    int CodedBlockPatternLuma = 0;
> +    int x, y, i, j;
> +    DCTELEM coefficients[256];
> +    int nA, nB;
> +
> +    for (y = 0 ; y < 4 ; y++)
> +        for (x = 0 ; x < 4 ; x++)
> +            H264_COUNT_AND_CLIP_SUBBLOCK(residual->part4x4Y[y][x],
> lumaACcount);
> +
> +    for (y = 0 ; y < 2 ; y++)
> +    {
> +        for (x = 0 ; x < 2 ; x++)
> +        {
> +            H264_COUNT_AND_CLIP_SUBBLOCK(residual->part4x4U[y][x],
> chromaACcount);
> +            H264_COUNT_AND_CLIP_SUBBLOCK(residual->part4x4V[y][x],
> chromaACcount);
> +        }
> +    }
> +
> +    for (y = 0 ; y < 2 ; y++)
> +    {
> +        for (x = 0 ; x < 2 ; x++)
> +        {
> +            H264_COUNT_AND_CLIP(UD[y][x], chromaDCcount);
> +            H264_COUNT_AND_CLIP(VD[y][x], chromaDCcount);
> +        }
> +    }
> +
> +    for (y = 0 ; y < 4 ; y++)
> +        for (x = 0 ; x < 4 ; x++)
> +            clip_inplace(YD[y][x], -2047, 2047);

None of this code should even exist.  You don't need to count the
number of coefficients if you're doing things right; it gets counted
inside the entropy coder.

> +    set_se_golomb(b, 0); // mb_qp_delta

Why hardcode the most useful syntax element in H.264 to zero?

> +    for (i = 0 ; i < 16 ; i++)
> +        coefficients[i] = YD[zigzagy[i]][zigzagx[i]];

Make this a function.

> +        { { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}, {
> 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243} },
> +        { { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}, {
> 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660} },
> +        { { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}, {
> 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194} },
> +        { {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}, {
>  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647} },
> +        { {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}, {
>  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355} },
> +        { {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}, {
>  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893} }

Didn't you already declare this earlier?

> +static inline void ff_h264_hadamard_inv_quant_2x2(DCTELEM Y[2][2], int QP)
> +{
> +    int32_t V = h264_V00[QP%6];
> +    int div = QP/6;
> +
> +    V <<= div;
> +    Y[0][0] = (Y[0][0]*V) >> 5;
> +    Y[0][1] = (Y[0][1]*V) >> 5;
> +    Y[1][0] = (Y[1][0]*V) >> 5;
> +    Y[1][1] = (Y[1][1]*V) >> 5;
> +}

Why do you combine some dequant functions with inverse transform, but
others you don't?

> +/*
> + * Only if qpprime_y_zero_transform_bypass_flag == 0
> + */

Whaaaaat?  Everything above assumes baseline-only, and now we have
something restricted to High 4:4:4 Predictive?

> +    int leftavail = 0;
> +    int topavail = 0;
> +    int topleftavail = 0;

This stuff should have been calculated in fill_caches or similar.

> +    enum {
> +        H264_INTRA_DC = 0,
> +        H264_INTRA_VERTICAL,
> +        H264_INTRA_HORIZONTAL,
> +        H264_INTRA_PLANE
> +    };

These are already defined in ffmpeg.

> +    if (available_prediction_strategies & (1 << H264_INTRA_PLANE))
> +    {
> +        t->hpc.pred16x16[PLANE_PRED8x8](destmb->Y[0], t->refframe_width);
> +        /* Calculate the SAD using this prediction */
> +        sad = t->dspcontext.pix_abs[0][0](0, targetmb->Y[0], destmb->Y[0],
> t->Y_stride, 16);
> +        if (sad < lowest_sad)
> +        {
> +            lowest_sad = sad;
> +            prediction_strategy = H264_INTRA_PLANE;
> +        }
> +        if (DEBUG_INTRA_PREDICTION)
> +            av_log(avctx, AV_LOG_DEBUG, "plane sad: %d\n", sad);
> +    }
> +
> +    if (available_prediction_strategies & (1 << H264_INTRA_HORIZONTAL))
> +    {
> +        t->hpc.pred16x16[HOR_PRED8x8](destmb->Y[0], t->refframe_width);
> +        /* Calculate the SAD using this prediction */
> +        sad = t->dspcontext.pix_abs[0][0](0, targetmb->Y[0], destmb->Y[0],
> t->Y_stride, 16);
> +        if (sad < lowest_sad)
> +        {
> +            lowest_sad = sad;
> +            prediction_strategy = H264_INTRA_HORIZONTAL;
> +        }
> +        if (DEBUG_INTRA_PREDICTION)
> +            av_log(avctx, AV_LOG_DEBUG, "horizontal sad: %d\n", sad);
> +    }
> +
> +    if (available_prediction_strategies & (1 << H264_INTRA_VERTICAL))
> +    {
> +        t->hpc.pred16x16[VERT_PRED8x8](destmb->Y[0], t->refframe_width);
> +        /* Calculate the SAD using this prediction */
> +        sad = t->dspcontext.pix_abs[0][0](0, targetmb->Y[0], destmb->Y[0],
> t->Y_stride, 16);
> +        if (sad < lowest_sad)
> +        {
> +            lowest_sad = sad;
> +            prediction_strategy = H264_INTRA_VERTICAL;
> +        }
> +        if (DEBUG_INTRA_PREDICTION)
> +            av_log(avctx, AV_LOG_DEBUG, "vertical sad: %d\n", sad);
> +    }

Holy code duplication batman.

> +    if (available_prediction_strategies & (1 << H264_INTRA_DC))
> +    {
> +        if(topavail)
> +        {
> +            if(leftavail)
> +            {
> +                if (DEBUG_INTRA_PREDICTION)
> +                    av_log(avctx, AV_LOG_DEBUG, "all avail, using
> average\n");
> +                t->hpc.pred16x16[DC_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +            }
> +            else
> +            {
> +                if (DEBUG_INTRA_PREDICTION)
> +                    av_log(avctx, AV_LOG_DEBUG, "top avail, using average
> of top\n");
> +                t->hpc.pred16x16[TOP_DC_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +            }
> +        }
> +        else
> +        {
> +            if(leftavail)
> +            {
> +                if (DEBUG_INTRA_PREDICTION)
> +                    av_log(avctx, AV_LOG_DEBUG, "left avail, using average
> of left\n");
> +                t->hpc.pred16x16[LEFT_DC_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +            }
> +            else {
> +                if (DEBUG_INTRA_PREDICTION)
> +                    av_log(avctx, AV_LOG_DEBUG, "nothing avail, using
> 128\n");
> +                t->hpc.pred16x16[DC_128_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +            }
> +        }

Wayyyy too complicated.

> +        /* Calculate the SAD using this prediction */
> +        sad = t->dspcontext.pix_abs[0][0](0, targetmb->Y[0], destmb->Y[0],
> t->Y_stride, 16);
> +        if (sad < lowest_sad)
> +        {
> +            lowest_sad = sad;
> +            prediction_strategy = H264_INTRA_DC;
> +        }
> +        if (DEBUG_INTRA_PREDICTION)
> +            av_log(avctx, AV_LOG_DEBUG, "DC sad: %d\n", sad);
> +    }

Why are we using SAD instead of the user-selected mbcmp?

> +    /* Use the prediction strategy which resulted in the lowest SAD */
> +    switch(prediction_strategy)
> +    {
> +        case H264_INTRA_PLANE:
> +            if (DEBUG_INTRA_PREDICTION)
> +                av_log(avctx, AV_LOG_DEBUG, "INTRA_PLANE\n");
> +            t->hpc.pred16x16[PLANE_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +            t->hpc.pred8x8[PLANE_PRED8x8](destmb->U[0],
> t->refframe_width>>1);
> +            t->hpc.pred8x8[PLANE_PRED8x8](destmb->V[0],
> t->refframe_width>>1);
> +            lumapredmode = PLANE_PRED8x8;
> +            chromapredmode = PLANE_PRED8x8;
> +            break;
> +
> +        case H264_INTRA_HORIZONTAL:
> +            if (DEBUG_INTRA_PREDICTION)
> +                av_log(avctx, AV_LOG_DEBUG, "INTRA_HOR\n");
> +            // Horizontal prediction
> +            t->hpc.pred16x16[HOR_PRED8x8](destmb->Y[0], t->refframe_width);
> +            t->hpc.pred8x8[HOR_PRED8x8](destmb->U[0],
> t->refframe_width>>1);
> +            t->hpc.pred8x8[HOR_PRED8x8](destmb->V[0],
> t->refframe_width>>1);
> +            lumapredmode = HOR_PRED8x8;
> +            chromapredmode = HOR_PRED8x8;
> +            break;
> +
> +        case H264_INTRA_VERTICAL:
> +            if (DEBUG_INTRA_PREDICTION)
> +                av_log(avctx, AV_LOG_DEBUG, "INTRA_VERTICAL\n");
> +            // Vertical prediction
> +            t->hpc.pred16x16[VERT_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +            t->hpc.pred8x8[VERT_PRED8x8](destmb->U[0],
> t->refframe_width>>1);
> +            t->hpc.pred8x8[VERT_PRED8x8](destmb->V[0],
> t->refframe_width>>1);
> +            lumapredmode = VERT_PRED;
> +            chromapredmode = VERT_PRED8x8;
> +            break;
> +
> +
> +        case H264_INTRA_DC:
> +            if (DEBUG_INTRA_PREDICTION)
> +                av_log(avctx, AV_LOG_DEBUG, "INTRA_DC\n");
> +            if ((!topavail) && (!leftavail))
> +                //if (1) // XXX: Illegal! Decoder will not know that 128
> was used,
> +                //but will use prediction based on previous blocks.
> +            {
> +                if (DEBUG_INTRA_PREDICTION)
> +                    av_log(avctx, AV_LOG_DEBUG, "nothing avail, using
> 128\n");
> +                t->hpc.pred16x16[DC_128_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +                t->hpc.pred8x8[DC_128_PRED8x8](destmb->U[0],
> t->refframe_width>>1);
> +                t->hpc.pred8x8[DC_128_PRED8x8](destmb->V[0],
> t->refframe_width>>1);
> +            }

Why do we assume the optimal chroma pred mode is the same as luma?

> +            else
> +            {
> +                if(topavail)
> +                {
> +                    if(leftavail)
> +                    {
> +                        if (DEBUG_INTRA_PREDICTION)
> +                            av_log(avctx, AV_LOG_DEBUG, "all avail, using
> average\n");
> +                        t->hpc.pred16x16[DC_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +                        t->hpc.pred8x8[DC_PRED8x8](destmb->U[0],
> t->refframe_width>>1);
> +                        t->hpc.pred8x8[DC_PRED8x8](destmb->V[0],
> t->refframe_width>>1);
> +                    }
> +                    else
> +                    {
> +                        if (DEBUG_INTRA_PREDICTION)
> +                            av_log(avctx, AV_LOG_DEBUG, "top avail, using
> average of top\n");
> +                        t->hpc.pred16x16[TOP_DC_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +                        t->hpc.pred8x8[TOP_DC_PRED8x8](destmb->U[0],
> t->refframe_width>>1);
> +                        t->hpc.pred8x8[TOP_DC_PRED8x8](destmb->V[0],
> t->refframe_width>>1);
> +                    }
>                 }
> -//                last_non_zero = i;
> -            } else {
> -                block[0] = 0;
> +                else
> +                {
> +                    if(leftavail)
> +                    {
> +                        if (DEBUG_INTRA_PREDICTION)
> +                            av_log(avctx, AV_LOG_DEBUG, "left avail, using
> average of left\n");
> +                        t->hpc.pred16x16[LEFT_DC_PRED8x8](destmb->Y[0],
> t->refframe_width);
> +                        t->hpc.pred8x8[LEFT_DC_PRED8x8](destmb->U[0],
> t->refframe_width>>1);
> +                        t->hpc.pred8x8[LEFT_DC_PRED8x8](destmb->V[0],
> t->refframe_width>>1);
> +                    }
> +                    else
> +                    {
> +                        av_log(avctx, AV_LOG_ERROR, "error! we should not
> get here!\n");
> +
> +                    }
> +                }
>             }
> +            lumapredmode = DC_PRED;
> +            chromapredmode = DC_PRED8x8;
> +            break;
> +    }

Holy crap, you just basically copy-pasted this entire code block.  Bad kitty.

> +static void h264_encode_inter16x16_residual(H264EncContext *t,

Aaaannd the patch is cut off.

Summary so far: What is this shit?

Dark Shikari