[FFmpeg-devel] [PATCH] Mimic decoder

Fri Mar 14 22:14:29 CET 2008

Hello,

I'm sending the updated patch to Michael's review.

[...]
>> typedef struct {
>>     int width;
>>     int height;
>>     enum PixelFormat pix_fmt;
> 
> Why do you duplicate these here?  They are already in AVCodecContext.

Added a pointer to AVCodecContext instead.

[...]
>> static void bswap_buf(uint8_t *newbuf, const uint8_t *oldbuf, int bufsize)
>> {
>>     int i;
>>     for(i = 0 ; i < (bufsize >> 2) ; i++)
>>         ((uint32_t*) newbuf)[i] = bswap_32(((const uint32_t*) oldbuf)[i]);
>> }
> 
> Use DSPContext.bswap_buf().

Done.

[...]
>> static int vlc_decode_block(MimicContext *ctx, DCTELEM *block, int num_coeffs)
>> {
>>     unsigned int pos;
>>
>>     memset(block, 0, 64 * sizeof(DCTELEM));
>>
>>     /* The DC-value is read in as is. */
>>     block[0] = get_bits(&ctx->gb, 8);
>>
>>     for(pos = 1; pos < num_coeffs; pos++) {
>>         uint32_t vlc, num_bits;
>>         int value;
>>
>>         vlc = get_vlc2(&ctx->gb, ctx->vlc1.table, ctx->vlc1.bits, 4);
>>         if(vlc == -1)
>>             return 0;
>>         if(vlc == 3) /* end-of-block code */
>>             return 1;
>>
>>         pos +=     magic_values[vlc][0];
>>         num_bits = magic_values[vlc][1];
>>
>>         value = get_bits(&ctx->gb, num_bits);
>>
>>         block[ctx->scantable.permutated[pos]] = ctx->vlcdec_lookup[num_bits][value];
>>     }
> 
> It would be nice if you could somehow avoid that memset().  Do you
> have any idea how many coefficients are typically coded?  Maybe it's
> few enough that it doesn't matter.

I'm not sure it's that easy, unless you elaborate on the idea... There 
are at most 28 coeffs on MSN Messenger's videos.

>>     return 1;
>> }
>>
>> static void dequant_block(MimicContext *ctx, DCTELEM *block, int is_chrom)
>> {
>>     int i, f = ctx->f[is_chrom];
> 
> Why don't you make 'f' an argument to this function instead?

Done.

>>     /* FFmpeg's IDCT behaves somewhat different from the original code, so
>>      * a factor of 4 was added to the input */
>>
>>     /* De-quantize. */
>>     block[0] <<= 3;
>>     block[1] <<= 4;
>>     block[8] <<= 4;
>>
>>     for(i = 2; i < 64; i++) {
>>         if(i == 8)
>>             continue;
>>
>>         /* TODO Use >> 10 instead of / 1001 */
>>         block[i] = (block[i] * f) / 1001;
>>     }
>> }
> 
> I doubt this works properly with permuted blocks.

Fixed while moving dequant to deblock.

>> /*
>>  * decode_main
>>  *
>>  * Main decoding loop.
>>  */
>> static int decode(MimicContext *ctx, int quality, int num_coeffs, int is_pframe)
>> {
>>     int y, x, i, chrom_ch, offset;
>>     DECLARE_ALIGNED_16(DCTELEM, dct_block[64]);
>>     uint8_t *src, *dst, *p;
>>     uint8_t *base_src, *base_dst;
>>     uint32_t bit;
>>     int y_stride = ctx->picture.linesize[0];
>>     int crcb_stride = ctx->picture.linesize[1];
>>     int crcb_size = ctx->picture.linesize[1] * (ctx->height>>1);
>>
>>     /* Clear Cr and Cb planes. */
>>     memset(ctx->picture.data[1], 128, crcb_size);
>>     memset(ctx->picture.data[2], 128, crcb_size);
> 
> Why?

Removed. They always get written to anyways.

>>     ctx->f[0] = av_clip(10000-quality, 1000, 10000)<<2;
>>     ctx->f[1] = av_clip(10000-quality, 2000, 10000)<<2;
>>
>>     /* Decode Y plane. */
>>     for(y = ctx->num_vblocks_y - 1; y >= 0 ; y--) {
> 
> What is it with Microsoft and inverted images?

...

>>         offset = (y_stride << 3) * y;
>>         src = ctx->prev_frame.data[0] + offset;
>>         dst = ctx->picture.data[0]  + offset;
>>         for(x = 0; x < ctx->num_hblocks_y; x++) {
>>             /* Check for a change condition in the current block. */
>>             bit = is_pframe ? get_bits1(&ctx->gb) : 0;
>>             if(!bit) {
>>                 /* Yes: Is the new content the same as it was in one of
>>                  * the 15 last frames preceding the previous? */
>>                 if(is_pframe)
>>                     bit = get_bits1(&ctx->gb);
> 
> This looks like it could be a case for decode012(), not sure.

This can't be done with the merged loops anymore.

>>                 if(!bit) {
>>                     /* No: decode it. */
>>                     if(!vlc_decode_block(ctx, dct_block, num_coeffs)) {
>>                         /* Corruped frame, return. */
>>                         return 0;
>>                     }
>>                     dequant_block(ctx, dct_block, 0);
>>                     ctx->dsp.idct_put(dst+7*y_stride, -y_stride, dct_block);
>>                 } else {
>>                     uint32_t backref;
> 
> Make that a plain 'unsigned'.  There is no need to force exactly 32 bits.

Ok.

>>                     /* Yes: read the backreference (4 bits) and copy. */
>>                     backref = get_bits(&ctx->gb, 4);
>>                     p = ctx->buf_ptrs[(ctx->ptr_index + backref) % 16].data[0];
> 
> Do you trust the compiler with the % operator?  Better use & 15 instead.

Ok.

>>                     p += offset + (x << 3);
>>                     for(i = 7; i >= 0; i--) {
>>                         int new_offset = y_stride * i;
>>                         memcpy(dst + new_offset, p + new_offset, 8);
>>                     }
> 
> for (i = 0; i < 8; i++) {
>     memcpy(dst + offset, p + offset, 8);
>     offset += y_stride;
> }

dsp.put_pixels_tab used instead as suggested by Michael.

>>                 }
>>             } else {
>>                 /* No change no worries: just copy from the previous frame. */
>>                 for(i = 7; i >= 0; i--) {
>>                     int new_offset = y_stride * i;
>>                     memcpy(dst + new_offset, src + new_offset, 8);
>>                 }
> 
> Can you think of a nice way to combine those two identical for()
> loops?

One takes p as source, the other takes src. dsp.put_pixels_tab is 
one-lined now, so it's pretty simplified.

>>             }
>>             src += 8;
>>             dst += 8;
>>         }
>>     }
>>
>>     /* Decode Cr and Cb planes. */
>>     for(chrom_ch = 0; chrom_ch < 2; chrom_ch++) {
>>         base_src = (chrom_ch ? ctx->prev_frame.data[1] : ctx->prev_frame.data[2]);
>>         base_dst = (chrom_ch ? ctx->picture.data[1] : ctx->picture.data[2]);
> 
> for (chrom_ch = 1; chrom_ch <= 2; chrom_ch++) {
>     base_dst = ctx->prev_frame.data[3-chrom_ch];
>     ...

With the merged loops it was done differently.

>>         for(y = ctx->num_vblocks_cbcr - 1; y >= 0 ; y--) {
>>             unsigned int num_rows = 8;
>>             /* The last row of blocks in chrominance for 160x120 resolution
>>              * is half the normal height and must be accounted for. */
>>             if(y + 1 == ctx->num_vblocks_cbcr && ctx->height % 16)
>>                 num_rows = 4;
> 
> How does this interact with 8x8 IDCT blocks?

I don't have 160x120 samples to test yet.

> And never use the % operator.

Ok.

>>             offset = (crcb_stride << 3) * y;
>>             src = base_src + offset;
>>             dst = base_dst + offset;
>>             for(x = 0; x < ctx->num_hblocks_cbcr; x++) {
>>                 /* Check for a change condition in the current block. */
>>                 bit = is_pframe ? get_bits1(&ctx->gb) : 1;
>>                 if(bit == 1) {
>>                     /* Yes: decode it. */
>>                     if(!vlc_decode_block(ctx, dct_block, num_coeffs)) {
>>                         /* Corrupted frame: clear Cr and Cb planes and return. */
>>                         memset(ctx->picture.data[1], 128, crcb_size);
>>                         memset(ctx->picture.data[2], 128, crcb_size);
> 
> Why?

Removed. The frame will be innacurate anyways if decoding fails...

>>                         return 0;
>>                     }
>>                     dequant_block(ctx, dct_block, 1);
>>                     ctx->dsp.idct_put(dst+7*crcb_stride, -crcb_stride, dct_block);
>>                 } else {
>>                     /* No change no worries: just copy from the previous frame. */
>>                     for(i = num_rows-1; i >= 0; i--) {
>>                         int new_offset = crcb_stride * i;
>>                         memcpy(dst + new_offset, src + new_offset, 8);
>>                     }
>>                 }
>>                 src += 8;
>>                 dst += 8;
>>             }
>>         }
>>     }
> 
> The chroma code is very similar to the luma code.  No chance they
> could be merged?

Merged. Cleaner code = 3.8% slowdown though =(

>>     /*
>>      * Make a copy of the current frame and store in
>>      * the circular pointer list of 16 entries.
>>      */
>>     ctx->prev_frame = ctx->buf_ptrs[ctx->ptr_index];
>>
>>     av_picture_copy((AVPicture*)&ctx->prev_frame, (AVPicture*)&ctx->picture,
>>                     ctx->pix_fmt, ctx->width, ctx->height);
>>
>>     if(--ctx->ptr_index < 0)
>>         ctx->ptr_index = 15;
> 
> This isn't speed critical, but ctx->ptr_index--; ctx->ptr_index &= 15
> is faster.

Ok.

>> #ifdef MIMIC_POSTPROC
>>     /* Perform deblocking on all planes. */
>>     deblock(ctx->picture.data[0], y_stride, ctx->height);
>>     deblock(ctx->picture.data[1], crcb_stride, ctx->height>>1);
>>     deblock(ctx->picture.data[2], crcb_stride, ctx->height>>1);
>> #endif
>>
>>     return 1;
>> }
>>
>> #ifdef MIMIC_POSTPROC
> 
> [...]
> 
> Not commenting on this for now.

Post-processing was removed.

>> #endif
>>
>> static const uint8_t col_zag[64] = {
>>      0,  8,  1,  2,  9, 16, 24, 17,
>>     10,  3,  4, 11, 18, 25, 32, 40,
>>     33, 26, 19, 12,  5,  6, 13, 20,
>>     27, 34, 41, 48, 56, 49, 42, 35,
>>     28, 21, 14,  7, 15, 22, 29, 36,
>>     43, 50, 57, 58, 51, 44, 37, 30,
>>     23, 31, 38, 45, 52, 59, 39, 46,
>>     53, 60, 61, 54, 47, 55, 62, 63
>> };
>>
>> static const uint32_t huffcodes[] = {
>>     0x00000000, 0x00000001, 0x00000004, 0x0000000a, 0x0000000b, 0x0000000c,
>>     0x0000001a, 0x0000001b, 0x00000038, 0x00000039, 0x0000003a, 0x0000003b,
>>     0x00000078, 0x00000079, 0x0000007a, 0x0000007b, 0x000000f8, 0x000000f9,
>>     0x000000fa, 0x000000fb, 0x000001f8, 0x000001f9, 0x000001fa, 0x000001fb,
>>     0x000003f8, 0x000003f9, 0x000003fa, 0x000003fb, 0x000007f8, 0x000007f9,
>>     0x000007fa, 0x000007fb, 0x00000ff8, 0x00000ff9, 0x00000ffa, 0x00000ffb,
>>     0x00001ff8, 0x00001ff9, 0x00001ffa, 0x00001ffb, 0x00003ff8, 0x00003ff9,
>>     0x00003ffa, 0x00003ffb, 0x00007ff8, 0x00007ff9, 0x00007ffa, 0x00007ffb,
>>     0x0000fff8, 0x0000fff9, 0x0000fffa, 0x0000fffb, 0x0001fff8, 0x0001fff9,
>>     0x0001fffa, 0x0001fffb, 0x0003fff8, 0x0003fff9, 0x0003fffa, 0x0003fffb,
>>     0x0007fff8, 0x0007fff9, 0x0007fffa, 0x0007fffb, 0x000ffff8, 0x000ffff9,
>>     0x000ffffa, 0x000ffffb, 0x001ffff8, 0x001ffff9, 0x001ffffa, 0x001ffffb,
>>     0x003ffff8, 0x003ffff9, 0x003ffffa, 0x003ffffb, 0x007ffff8, 0x007ffff9,
>>     0x007ffffa, 0x007ffffb, 0x00fffff8, 0x00fffff9, 0x00fffffa, 0x00fffffb,
>>     0x01fffff8, 0x01fffff9, 0x01fffffa, 0x01fffffb, 0x03fffff8, 0x03fffff9,
>>     0x03fffffa, 0x03fffffb, 0x07fffff8, 0x07fffff9, 0x07fffffa, 0x07fffffb,
>>     0x0ffffff8, 0x0ffffff9, 0x0ffffffa, 0x0ffffffb, 0x1ffffff8, 0x1ffffff9,
>>     0x1ffffffa, 0x1ffffffb, 0x3ffffff8, 0x3ffffff9, 0x3ffffffa,
>> };
>>
>> static const uint8_t huffbits[] = {
>>      2,  2,  3,  4,  4,  4,  5,  5,  6,  6,  6,  6,
>>      7,  7,  7,  7,  8,  8,  8,  8,  9,  9,  9,  9,
>>     10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12,
>>     13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
>>     16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
>>     19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
>>     22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24,
>>     25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27,
>>     28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30,
>> };
> 
> Those last two tables look like they could be computed by some simple
> expressions (which I am too lazy to work out right now).

I used Michael's approach to code the values into the VLC codes. I think 
it's not possible to compute the table easily anymore.

[...]
>> static int mimic_decode_init(AVCodecContext *avctx)
>> {
>>     MimicContext *ctx = avctx->priv_data;
>>     int width, height;
>>     int i;
>>
>>     if(!avctx->extradata) {
>>         av_log(avctx, AV_LOG_ERROR, "extradata missing!\n");
>>         return -1;
>>     }
> 
> You should check the size of extradata too.

Checked.

>>     width  = AV_RL16(avctx->extradata + 4);
>>     height = AV_RL16(avctx->extradata + 6);
>>
>>     if(!(width == 160 && height == 120) && !(width == 320 && height == 240)) {
>>         av_log(avctx, AV_LOG_ERROR, "invalid width/height!\n");
>>         return -1;
>>     }
>>
>>     ctx->width  = avctx->width  = width ;
>>     ctx->height = avctx->height = height;
>>     ctx->pix_fmt = avctx->pix_fmt = PIX_FMT_YUV420P;
> 
> Why?

Because.

I think you mean "Why duplicate values from avctx->width into ctx?". If 
that's the case, then it's fixed...

>>     ctx->num_vblocks_y = ctx->height >> 3;
>>     ctx->num_hblocks_y = ctx->width  >> 3;
>>     ctx->num_vblocks_cbcr = ctx->height >> 4;
>>     ctx->num_hblocks_cbcr = ctx->width  >> 4;
>>     if(ctx->height % 16)
>>         ctx->num_vblocks_cbcr++;
> 
> ctx->num_vblocks_cbcr += !!(ctx->height & 15), if you like a little
> obfuscation.  In a speed-critical place, it would probably be faster.

I think it's too obfuscated for a function that will only be called 
once. Only changed %16 to &15.

[...]
>> static int mimic_decode_frame(AVCodecContext *avctx, void *data,
>>                               int *data_size, const uint8_t *buf, int buf_size)
>> {
>>     MimicContext *ctx = avctx->priv_data;
>>     int is_pframe;
>>     int width, height;
>>     int quality, num_coeffs;
>>
>>     if(ctx->picture.data[0]) {
>>         avctx->release_buffer(avctx, &ctx->picture);
>>     }
>>
>>     if(avctx->get_buffer(avctx, &ctx->picture)) {
>>         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
>>         return -1;
>>     }
>>
>>     width  = AV_RL16(buf + 4);
>>     height = AV_RL16(buf + 6);
>>
>>     if(width != ctx->width || height != ctx->height) {
>>         av_log(avctx, AV_LOG_ERROR, "resolution changing is not supported\n");
>>         return -1;
>>     }
>>
>>     quality = AV_RL16(buf + 2);
>>     is_pframe = AV_RL32(buf + 12);
>>     num_coeffs = buf[16];
> 
> You should check the buffer size before reading.

Checked.

>>     if(!(is_pframe && !ctx->prev_frame.data[0])) {
>>         int swap_buf_size = buf_size - 20;
>>         ctx->swap_buf = av_fast_realloc(ctx->swap_buf, &ctx->swap_buf_size,
>>                                  swap_buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
>>         if(!ctx->swap_buf)
>>             return AVERROR_NOMEM;
>>
>>         /* TODO find out why dsp.bswap_buf crashes */
> 
> Did you ask Valgrind?

Yes. He told me I was reading and writing where I shouldn't have been...

>>         bswap_buf(ctx->swap_buf, buf + 20, swap_buf_size);
>>         init_get_bits(&ctx->gb, ctx->swap_buf, swap_buf_size << 3);
>>
>>         if(!decode(ctx, quality, num_coeffs, is_pframe))
>>             return -1;
>>     } else {
>>         av_log(avctx, AV_LOG_ERROR, "decoding must start with keyframe\n");
>>         return -1;
>>     }
>>
>>     ctx->picture.pict_type = is_pframe ? FF_P_TYPE : FF_I_TYPE;
>>     *(AVFrame*)data = ctx->picture;
>>     *data_size = sizeof(AVFrame);
>>
>>     return buf_size;
>> }
>>
>> static int mimic_decode_end(AVCodecContext *avctx)
>> {
>>     MimicContext *ctx = avctx->priv_data;
>>     int i;
>>
>>     if(ctx->swap_buf)
>>         av_free(ctx->swap_buf);
> 
> Useless if().

Removed.

Ramiro Polla