[Ffmpeg-devel] [RFC] VC3/DNxHD decoder

Sun Mar 18 01:30:29 CET 2007

Hi

On Sat, Mar 17, 2007 at 03:40:19PM +0100, Baptiste Coudurier wrote:
> Hi
> 
> $subject.
> 
> I need to include "mpegvideo.h" only for Scantable and
> ff_init_scantable, can we separate them from mpegvideo.h ? like
> scantable.h ?
> 
> It decodes 1080p atm, but you should only need to add ac/dc/run vlc
> tables to decode other progressive modes. Interlaced mode is almost
> finished, bitstream is one complete frame but 2 sequentials fields
> bitstreams.
> 
> Also if someone could explain me how CODEC_CAP_DR1 and draw_horiz_band
> and low_res works, I can implement them.
> 
> Optimization ideas very welcome, same for dsp functions use.

[...]

> +    int cid; //<< compression id

hmm shouldnt it be ///<

[...]
> +    DCTELEM blocks[8][64];

this should be DECLARE_ALIGNED_16

> +    DECLARE_ALIGNED_8(ScanTable, scantable);
> +    const CIDEntry *cid_table;
> +} DNXHDContext;
> +
> +static const CIDEntry cid_table[] = {
> +    { 1238, 1920, 1080, 0, 917504, 4, 8,
> +      dnxhd_1238_luma_weigth, dnxhd_1238_chroma_weigth,
> +      dnxhd_1238_dc_codes, dnxhd_1238_dc_bits,
> +      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level,
> +      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
> +      dnxhd_1238_run_codes, dnxhd_1238_run_bits, dnxhd_1238_run_level },
> +/*     { 1243, 1920, 1080, 1, 917504, 4, 8, */
> +/*       dnxhd_1243_luma_weigth, dnxhd_1243_chroma_weigth, */
> +/*       dnxhd_1238_dc_codes, dnxhd_1238_dc_bits, */
> +/*       dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level, */
> +/*       dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag, */
> +/*       dnxhd_1238_run_codes, dnxhd_1238_run_bits, dnxhd_1238_run_level }, */
> +};

id suggest that all values which are always the same no matter which cid
should be hardcoded as that should be faster

[...]
> +        init_vlc(&ctx->dc_vlc, DNXHD_VLC_BITS, 12,
> +                 cid_table->dc_bits, 1, 1,
> +                 cid_table->dc_codes, 1, 1, 0);

the longest dc bits is just 6 so DNXHD_VLC_BITS, which is 9 would waste
memory and cache

[...]
> +        for(i = 0; i < 64; i++)
> +            ctx->luma_weigth[dnxhd_zigzag[i]] = cid_table->luma_weigth[i];
> +        for(i = 0; i < 64; i++)
> +            ctx->chroma_weigth[dnxhd_zigzag[i]] = cid_table->chroma_weigth[i];

you could store these tables already permutated in dnxhd_zigzag order in the
header

[...]
> +    av_log(ctx->avctx, AV_LOG_DEBUG, "mb width %d, mb height %d\n", ctx->mb_width, ctx->mb_height);
> +    for (i = 0; i < ctx->mb_height; i++) {
> +        ctx->mb_scan_index[i] = AV_RB32(buf + 0x170 + (i<<2));
> +        av_log(ctx->avctx, AV_LOG_DEBUG, "mb scan index %d\n", ctx->mb_scan_index[i]);

i think you should check that mb_scan_index is within the buffer

[...]

> +static int dnxhd_decode_dc(DNXHDContext *ctx)
> +{
> +    int len;
> +    unsigned int p = 0;
> +    int e = 0;
> +
> +    len = get_vlc2(&ctx->gb, ctx->dc_vlc.table, DNXHD_VLC_BITS, 1);
> +    if (len) {
> +        p = get_bits(&ctx->gb, len);
> +        //av_log(ctx->avctx, AV_LOG_DEBUG, "p %d, len %d\n", p, len);
> +        e = p >= (1<<(len-1)) ? p : p + 1 - (1<<len);

hmm try get_xbits() maybe it does the same

> +    }
> +    return e;
> +}
> +
> +static int dnxhd_decode_dct_block(DNXHDContext *ctx, DCTELEM *block, int n, int qscale)
> +{
> +    int dc, i, j, index, index2;
> +    int16_t level;
> +    int component, sign;
> +    const uint8_t *quant_matrix;
> +    int diff;
> +
> +    if (n == 2 || n == 6) {
> +        component = 1;
> +        quant_matrix = ctx->chroma_weigth;
> +    } else if (n == 3 || n == 7) {
> +        component = 2;
> +        quant_matrix = ctx->chroma_weigth;
> +    } else {
> +        component = 0;
> +        quant_matrix = ctx->luma_weigth;
> +    }

if(n&2){
    component= 1 + (n&1);
    quant_matrix = ctx->chroma_weigth;
}else{
    component = 0;
    quant_matrix = ctx->luma_weigth;
}

> +    diff = dnxhd_decode_dc(ctx);
> +    dc = ctx->last_dc[component];
> +    dc += diff;
> +    ctx->last_dc[component] = dc;
> +    block[0] = dc;

ctx->last_dc[component] += dnxhd_decode_dc(ctx);
block[0]= ctx->last_dc[component];

> +    //av_log(ctx->avctx, AV_LOG_DEBUG, "dc %d, diff %d\n", dc, diff);
> +    for (i = 1; i < 65; i++) {
> +        index = get_vlc2(&ctx->gb, ctx->ac_vlc.table, DNXHD_VLC_BITS, 2);
> +        //av_log(ctx->avctx, AV_LOG_DEBUG, "index %d\n", index);
> +        if (index == 4) { /* EOB */
> +            //av_log(ctx->avctx, AV_LOG_DEBUG, "EOB\n");
> +            break;
> +        }
> +
> +        if (i > 63) {
> +            av_log(ctx->avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", n, i);
> +            return -1;
> +        }
> +
> +        sign = get_bits1(&ctx->gb) ? -1 : 1;
> +
> +        level = ctx->ac_level[index];
> +        if (ctx->ac_index_flag[index]) {
> +            level += get_bits(&ctx->gb, ctx->index_bits)<<6;
> +        }
> +
> +        if (ctx->ac_run_flag[index]) {

index= get_vlc()
ctx->ac_level[index]
ctx->ac_index_flag[index]
ctx->ac_run_flag[index]

is quite inefficint as it needs 3 reads for the pointers to the tables
3 more reads with base+index to get the actual 3 values

by chaning the tables this could be done for example like
level= get_vlc()
level & 0x400
level & 0x800
level &=0x3FF

you avoid all thouse reads

> +            index2 = get_vlc2(&ctx->gb, ctx->run_vlc.table, DNXHD_VLC_BITS, 2);
> +            i += ctx->run_level[index2];

i think run_level is not a good name, i see nothing related to the level
here ;)

> +        }
> +
> +        j = ctx->scantable.permutated[i];
> +        //av_log(ctx->avctx, AV_LOG_DEBUG, "j %d\n", j);
> +        if (level) {

the if(level) is useless it must be always true at least with the current
tables and i think tables for which its false make no sense ...

> +            //av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, quant %d\n", level, quant_matrix[i]);
> +            level = level * qscale * quant_matrix[i] + ((qscale * quant_matrix[i])>>1);
> +            if (quant_matrix[i] != 32) // FIXME 10bit
> +                level += 16;
> +            //av_log(ctx->avctx, AV_LOG_DEBUG, "temp %d ", level);
> +            level = sign * (level>>5); // FIXME 10bit

sign= get_bits1(&ctx->gb);

and

level= (2*level+1) * qscale * quant_matrix[i];
if (quant_matrix[i] != 32) // FIXME 10bit
    level += 32;
level >>= 6;
if(sign)
    level= -level

you could also try to simply use something like
level += quant_bias[i];
maybe its faster, maybe not ...

or maybe the following is faster ...

sign= get_sbits(&ctx->gb, 1);

level= (level^sign)-sign;

you could also try to merge the (2*level+1) into the level table though it
would complicat the ctx->ac_index_flag[index]==1 case a little but it still
might be faster as i think ctx->ac_index_flag[index]==1 is rare

> +        }
> +
> +        //av_log(NULL, AV_LOG_DEBUG, "i %d, j %d, end level %d\n", i, j, level);
> +        block[j] = level;
> +    }

the end checking of the whole loop can also be improved

instead of

for (i = 1; i < 65; i++) {
    if(EOB) break;
    if(i>63) return -1;
    if(flag)
        i+= get_vlc()
    write data based on i
}

id suggest

for (i = 1; ; i++) {
    if(EOB) break;
    if(flag)
        i+= get_vlc()
    if(i>63) return -1;
    write data based on i
}

[...]
> +static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y)
> +{
> +    int dct_linesize_luma   = ctx->picture.linesize[0];
> +    int dct_linesize_chroma = ctx->picture.linesize[1];
> +    uint8_t *dest_y, *dest_u, *dest_v;
> +    int dct_offset;
> +    int qscale, i, j, k;
> +
> +    //ctx->dsp.clear_blocks(ctx->blocks[0]);
> +
> +    memset(ctx->blocks, 0, sizeof(ctx->blocks)); //FIXME dsp clear blocks should take number of blocks

doing the clearing after put_pixels_clamped() may (or may not) be faster due
to cache effects

[...]
> +    for (i = 0; i < 8; i++) {
> +        dnxhd_decode_dct_block(ctx, ctx->blocks[i], i, qscale);
> +        ctx->dsp.idct(ctx->blocks[i]);

> +        for (j = 0; j < 8; j++)
> +            for (k = 0; k < 8; k++)
> +                ctx->blocks[i][j + (k<<3)] += 128; // adjust levels

setting last_dc to 1<<something instead of 0 at the begin of each row should
have the same effect and be quite a bit faster

and ctx->dsp.idct_put() could be used

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Why not whip the teacher when the pupil misbehaves? -- Diogenes of Sinope
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20070318/8c5c1df1/attachment.pgp>