[FFmpeg-devel] [PATCH] Mimic decoder
Ramiro Polla
ramiro
Fri Mar 14 22:14:29 CET 2008
Hello,
I'm sending the updated patch to Michael's review.
[...]
>> typedef struct {
>> int width;
>> int height;
>> enum PixelFormat pix_fmt;
>
> Why do you duplicate these here? They are already in AVCodecContext.
Added a pointer to AVCodecContext instead.
[...]
>> static void bswap_buf(uint8_t *newbuf, const uint8_t *oldbuf, int bufsize)
>> {
>> int i;
>> for(i = 0 ; i < (bufsize >> 2) ; i++)
>> ((uint32_t*) newbuf)[i] = bswap_32(((const uint32_t*) oldbuf)[i]);
>> }
>
> Use DSPContext.bswap_buf().
Done.
[...]
>> static int vlc_decode_block(MimicContext *ctx, DCTELEM *block, int num_coeffs)
>> {
>> unsigned int pos;
>>
>> memset(block, 0, 64 * sizeof(DCTELEM));
>>
>> /* The DC-value is read in as is. */
>> block[0] = get_bits(&ctx->gb, 8);
>>
>> for(pos = 1; pos < num_coeffs; pos++) {
>> uint32_t vlc, num_bits;
>> int value;
>>
>> vlc = get_vlc2(&ctx->gb, ctx->vlc1.table, ctx->vlc1.bits, 4);
>> if(vlc == -1)
>> return 0;
>> if(vlc == 3) /* end-of-block code */
>> return 1;
>>
>> pos += magic_values[vlc][0];
>> num_bits = magic_values[vlc][1];
>>
>> value = get_bits(&ctx->gb, num_bits);
>>
>> block[ctx->scantable.permutated[pos]] = ctx->vlcdec_lookup[num_bits][value];
>> }
>
> It would be nice if you could somehow avoid that memset(). Do you
> have any idea how many coefficients are typically coded? Maybe it's
> few enough that it doesn't matter.
I'm not sure it's that easy, unless you elaborate on the idea... There
are at most 28 coeffs on MSN Messenger's videos.
>> return 1;
>> }
>>
>> static void dequant_block(MimicContext *ctx, DCTELEM *block, int is_chrom)
>> {
>> int i, f = ctx->f[is_chrom];
>
> Why don't you make 'f' an argument to this function instead?
Done.
>> /* FFmpeg's IDCT behaves somewhat different from the original code, so
>> * a factor of 4 was added to the input */
>>
>> /* De-quantize. */
>> block[0] <<= 3;
>> block[1] <<= 4;
>> block[8] <<= 4;
>>
>> for(i = 2; i < 64; i++) {
>> if(i == 8)
>> continue;
>>
>> /* TODO Use >> 10 instead of / 1001 */
>> block[i] = (block[i] * f) / 1001;
>> }
>> }
>
> I doubt this works properly with permuted blocks.
Fixed while moving dequant to deblock.
>> /*
>> * decode_main
>> *
>> * Main decoding loop.
>> */
>> static int decode(MimicContext *ctx, int quality, int num_coeffs, int is_pframe)
>> {
>> int y, x, i, chrom_ch, offset;
>> DECLARE_ALIGNED_16(DCTELEM, dct_block[64]);
>> uint8_t *src, *dst, *p;
>> uint8_t *base_src, *base_dst;
>> uint32_t bit;
>> int y_stride = ctx->picture.linesize[0];
>> int crcb_stride = ctx->picture.linesize[1];
>> int crcb_size = ctx->picture.linesize[1] * (ctx->height>>1);
>>
>> /* Clear Cr and Cb planes. */
>> memset(ctx->picture.data[1], 128, crcb_size);
>> memset(ctx->picture.data[2], 128, crcb_size);
>
> Why?
Removed. They always get written to anyways.
>> ctx->f[0] = av_clip(10000-quality, 1000, 10000)<<2;
>> ctx->f[1] = av_clip(10000-quality, 2000, 10000)<<2;
>>
>> /* Decode Y plane. */
>> for(y = ctx->num_vblocks_y - 1; y >= 0 ; y--) {
>
> What is it with Microsoft and inverted images?
...
>> offset = (y_stride << 3) * y;
>> src = ctx->prev_frame.data[0] + offset;
>> dst = ctx->picture.data[0] + offset;
>> for(x = 0; x < ctx->num_hblocks_y; x++) {
>> /* Check for a change condition in the current block. */
>> bit = is_pframe ? get_bits1(&ctx->gb) : 0;
>> if(!bit) {
>> /* Yes: Is the new content the same as it was in one of
>> * the 15 last frames preceding the previous? */
>> if(is_pframe)
>> bit = get_bits1(&ctx->gb);
>
> This looks like it could be a case for decode012(), not sure.
This can't be done with the merged loops anymore.
>> if(!bit) {
>> /* No: decode it. */
>> if(!vlc_decode_block(ctx, dct_block, num_coeffs)) {
>> /* Corruped frame, return. */
>> return 0;
>> }
>> dequant_block(ctx, dct_block, 0);
>> ctx->dsp.idct_put(dst+7*y_stride, -y_stride, dct_block);
>> } else {
>> uint32_t backref;
>
> Make that a plain 'unsigned'. There is no need to force exactly 32 bits.
Ok.
>> /* Yes: read the backreference (4 bits) and copy. */
>> backref = get_bits(&ctx->gb, 4);
>> p = ctx->buf_ptrs[(ctx->ptr_index + backref) % 16].data[0];
>
> Do you trust the compiler with the % operator? Better use & 15 instead.
Ok.
>> p += offset + (x << 3);
>> for(i = 7; i >= 0; i--) {
>> int new_offset = y_stride * i;
>> memcpy(dst + new_offset, p + new_offset, 8);
>> }
>
> for (i = 0; i < 8; i++) {
> memcpy(dst + offset, p + offset, 8);
> offset += y_stride;
> }
dsp.put_pixels_tab used instead as suggested by Michael.
>> }
>> } else {
>> /* No change no worries: just copy from the previous frame. */
>> for(i = 7; i >= 0; i--) {
>> int new_offset = y_stride * i;
>> memcpy(dst + new_offset, src + new_offset, 8);
>> }
>
> Can you think of a nice way to combine those two identical for()
> loops?
One takes p as source, the other takes src. dsp.put_pixels_tab is
one-lined now, so it's pretty simplified.
>> }
>> src += 8;
>> dst += 8;
>> }
>> }
>>
>> /* Decode Cr and Cb planes. */
>> for(chrom_ch = 0; chrom_ch < 2; chrom_ch++) {
>> base_src = (chrom_ch ? ctx->prev_frame.data[1] : ctx->prev_frame.data[2]);
>> base_dst = (chrom_ch ? ctx->picture.data[1] : ctx->picture.data[2]);
>
> for (chrom_ch = 1; chrom_ch <= 2; chrom_ch++) {
> base_dst = ctx->prev_frame.data[3-chrom_ch];
> ...
With the merged loops it was done differently.
>> for(y = ctx->num_vblocks_cbcr - 1; y >= 0 ; y--) {
>> unsigned int num_rows = 8;
>> /* The last row of blocks in chrominance for 160x120 resolution
>> * is half the normal height and must be accounted for. */
>> if(y + 1 == ctx->num_vblocks_cbcr && ctx->height % 16)
>> num_rows = 4;
>
> How does this interact with 8x8 IDCT blocks?
I don't have 160x120 samples to test yet.
> And never use the % operator.
Ok.
>> offset = (crcb_stride << 3) * y;
>> src = base_src + offset;
>> dst = base_dst + offset;
>> for(x = 0; x < ctx->num_hblocks_cbcr; x++) {
>> /* Check for a change condition in the current block. */
>> bit = is_pframe ? get_bits1(&ctx->gb) : 1;
>> if(bit == 1) {
>> /* Yes: decode it. */
>> if(!vlc_decode_block(ctx, dct_block, num_coeffs)) {
>> /* Corrupted frame: clear Cr and Cb planes and return. */
>> memset(ctx->picture.data[1], 128, crcb_size);
>> memset(ctx->picture.data[2], 128, crcb_size);
>
> Why?
Removed. The frame will be innacurate anyways if decoding fails...
>> return 0;
>> }
>> dequant_block(ctx, dct_block, 1);
>> ctx->dsp.idct_put(dst+7*crcb_stride, -crcb_stride, dct_block);
>> } else {
>> /* No change no worries: just copy from the previous frame. */
>> for(i = num_rows-1; i >= 0; i--) {
>> int new_offset = crcb_stride * i;
>> memcpy(dst + new_offset, src + new_offset, 8);
>> }
>> }
>> src += 8;
>> dst += 8;
>> }
>> }
>> }
>
> The chroma code is very similar to the luma code. No chance they
> could be merged?
Merged. Cleaner code = 3.8% slowdown though =(
>> /*
>> * Make a copy of the current frame and store in
>> * the circular pointer list of 16 entries.
>> */
>> ctx->prev_frame = ctx->buf_ptrs[ctx->ptr_index];
>>
>> av_picture_copy((AVPicture*)&ctx->prev_frame, (AVPicture*)&ctx->picture,
>> ctx->pix_fmt, ctx->width, ctx->height);
>>
>> if(--ctx->ptr_index < 0)
>> ctx->ptr_index = 15;
>
> This isn't speed critical, but ctx->ptr_index--; ctx->ptr_index &= 15
> is faster.
Ok.
>> #ifdef MIMIC_POSTPROC
>> /* Perform deblocking on all planes. */
>> deblock(ctx->picture.data[0], y_stride, ctx->height);
>> deblock(ctx->picture.data[1], crcb_stride, ctx->height>>1);
>> deblock(ctx->picture.data[2], crcb_stride, ctx->height>>1);
>> #endif
>>
>> return 1;
>> }
>>
>> #ifdef MIMIC_POSTPROC
>
> [...]
>
> Not commenting on this for now.
Post-processing was removed.
>> #endif
>>
>> static const uint8_t col_zag[64] = {
>> 0, 8, 1, 2, 9, 16, 24, 17,
>> 10, 3, 4, 11, 18, 25, 32, 40,
>> 33, 26, 19, 12, 5, 6, 13, 20,
>> 27, 34, 41, 48, 56, 49, 42, 35,
>> 28, 21, 14, 7, 15, 22, 29, 36,
>> 43, 50, 57, 58, 51, 44, 37, 30,
>> 23, 31, 38, 45, 52, 59, 39, 46,
>> 53, 60, 61, 54, 47, 55, 62, 63
>> };
>>
>> static const uint32_t huffcodes[] = {
>> 0x00000000, 0x00000001, 0x00000004, 0x0000000a, 0x0000000b, 0x0000000c,
>> 0x0000001a, 0x0000001b, 0x00000038, 0x00000039, 0x0000003a, 0x0000003b,
>> 0x00000078, 0x00000079, 0x0000007a, 0x0000007b, 0x000000f8, 0x000000f9,
>> 0x000000fa, 0x000000fb, 0x000001f8, 0x000001f9, 0x000001fa, 0x000001fb,
>> 0x000003f8, 0x000003f9, 0x000003fa, 0x000003fb, 0x000007f8, 0x000007f9,
>> 0x000007fa, 0x000007fb, 0x00000ff8, 0x00000ff9, 0x00000ffa, 0x00000ffb,
>> 0x00001ff8, 0x00001ff9, 0x00001ffa, 0x00001ffb, 0x00003ff8, 0x00003ff9,
>> 0x00003ffa, 0x00003ffb, 0x00007ff8, 0x00007ff9, 0x00007ffa, 0x00007ffb,
>> 0x0000fff8, 0x0000fff9, 0x0000fffa, 0x0000fffb, 0x0001fff8, 0x0001fff9,
>> 0x0001fffa, 0x0001fffb, 0x0003fff8, 0x0003fff9, 0x0003fffa, 0x0003fffb,
>> 0x0007fff8, 0x0007fff9, 0x0007fffa, 0x0007fffb, 0x000ffff8, 0x000ffff9,
>> 0x000ffffa, 0x000ffffb, 0x001ffff8, 0x001ffff9, 0x001ffffa, 0x001ffffb,
>> 0x003ffff8, 0x003ffff9, 0x003ffffa, 0x003ffffb, 0x007ffff8, 0x007ffff9,
>> 0x007ffffa, 0x007ffffb, 0x00fffff8, 0x00fffff9, 0x00fffffa, 0x00fffffb,
>> 0x01fffff8, 0x01fffff9, 0x01fffffa, 0x01fffffb, 0x03fffff8, 0x03fffff9,
>> 0x03fffffa, 0x03fffffb, 0x07fffff8, 0x07fffff9, 0x07fffffa, 0x07fffffb,
>> 0x0ffffff8, 0x0ffffff9, 0x0ffffffa, 0x0ffffffb, 0x1ffffff8, 0x1ffffff9,
>> 0x1ffffffa, 0x1ffffffb, 0x3ffffff8, 0x3ffffff9, 0x3ffffffa,
>> };
>>
>> static const uint8_t huffbits[] = {
>> 2, 2, 3, 4, 4, 4, 5, 5, 6, 6, 6, 6,
>> 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,
>> 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12,
>> 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
>> 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
>> 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
>> 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24,
>> 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27,
>> 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30,
>> };
>
> Those last two tables look like they could be computed by some simple
> expressions (which I am too lazy to work out right now).
I used Michael's approach to code the values into the VLC codes. I think
it's not possible to compute the table easily anymore.
[...]
>> static int mimic_decode_init(AVCodecContext *avctx)
>> {
>> MimicContext *ctx = avctx->priv_data;
>> int width, height;
>> int i;
>>
>> if(!avctx->extradata) {
>> av_log(avctx, AV_LOG_ERROR, "extradata missing!\n");
>> return -1;
>> }
>
> You should check the size of extradata too.
Checked.
>> width = AV_RL16(avctx->extradata + 4);
>> height = AV_RL16(avctx->extradata + 6);
>>
>> if(!(width == 160 && height == 120) && !(width == 320 && height == 240)) {
>> av_log(avctx, AV_LOG_ERROR, "invalid width/height!\n");
>> return -1;
>> }
>>
>> ctx->width = avctx->width = width ;
>> ctx->height = avctx->height = height;
>> ctx->pix_fmt = avctx->pix_fmt = PIX_FMT_YUV420P;
>
> Why?
Because.
I think you mean "Why duplicate values from avctx->width into ctx?". If
that's the case, then it's fixed...
>> ctx->num_vblocks_y = ctx->height >> 3;
>> ctx->num_hblocks_y = ctx->width >> 3;
>> ctx->num_vblocks_cbcr = ctx->height >> 4;
>> ctx->num_hblocks_cbcr = ctx->width >> 4;
>> if(ctx->height % 16)
>> ctx->num_vblocks_cbcr++;
>
> ctx->num_vblocks_cbcr += !!(ctx->height & 15), if you like a little
> obfuscation. In a speed-critical place, it would probably be faster.
I think it's too obfuscated for a function that will only be called
once. Only changed %16 to &15.
[...]
>> static int mimic_decode_frame(AVCodecContext *avctx, void *data,
>> int *data_size, const uint8_t *buf, int buf_size)
>> {
>> MimicContext *ctx = avctx->priv_data;
>> int is_pframe;
>> int width, height;
>> int quality, num_coeffs;
>>
>> if(ctx->picture.data[0]) {
>> avctx->release_buffer(avctx, &ctx->picture);
>> }
>>
>> if(avctx->get_buffer(avctx, &ctx->picture)) {
>> av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
>> return -1;
>> }
>>
>> width = AV_RL16(buf + 4);
>> height = AV_RL16(buf + 6);
>>
>> if(width != ctx->width || height != ctx->height) {
>> av_log(avctx, AV_LOG_ERROR, "resolution changing is not supported\n");
>> return -1;
>> }
>>
>> quality = AV_RL16(buf + 2);
>> is_pframe = AV_RL32(buf + 12);
>> num_coeffs = buf[16];
>
> You should check the buffer size before reading.
Checked.
>> if(!(is_pframe && !ctx->prev_frame.data[0])) {
>> int swap_buf_size = buf_size - 20;
>> ctx->swap_buf = av_fast_realloc(ctx->swap_buf, &ctx->swap_buf_size,
>> swap_buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
>> if(!ctx->swap_buf)
>> return AVERROR_NOMEM;
>>
>> /* TODO find out why dsp.bswap_buf crashes */
>
> Did you ask Valgrind?
Yes. He told me I was reading and writing where I shouldn't have been...
>> bswap_buf(ctx->swap_buf, buf + 20, swap_buf_size);
>> init_get_bits(&ctx->gb, ctx->swap_buf, swap_buf_size << 3);
>>
>> if(!decode(ctx, quality, num_coeffs, is_pframe))
>> return -1;
>> } else {
>> av_log(avctx, AV_LOG_ERROR, "decoding must start with keyframe\n");
>> return -1;
>> }
>>
>> ctx->picture.pict_type = is_pframe ? FF_P_TYPE : FF_I_TYPE;
>> *(AVFrame*)data = ctx->picture;
>> *data_size = sizeof(AVFrame);
>>
>> return buf_size;
>> }
>>
>> static int mimic_decode_end(AVCodecContext *avctx)
>> {
>> MimicContext *ctx = avctx->priv_data;
>> int i;
>>
>> if(ctx->swap_buf)
>> av_free(ctx->swap_buf);
>
> Useless if().
Removed.
Ramiro Polla
More information about the ffmpeg-devel
mailing list