[FFmpeg-devel] [PATCH 01/N] RV30/40 Decoder - Core

Sat Dec 8 11:10:17 CET 2007

On Sat, Dec 08, 2007 at 05:32:07AM +0100, Michael Niedermayer wrote:
> On Fri, Dec 07, 2007 at 09:04:07AM +0200, Kostya wrote:
[...]
> > > please check if the following are identical (+-1 doesnt count as
> > > different)
> > > 
> > > svq3_get_ue_golomb() and 
> > > svq3_get_se_golomb()
> > > 
> > > and if they are identical post START/STOP_TIMER scores
> > > also id like to see START/STOP_TIMER scores in that case for the
> > > original vlc based code
> > > 
> > > the fastest should be placed in golomb.c/h
> > > and used
> > 
> > I don't know when I can do benchmarks but switched to svq3_get_[su]e_golomb()
> > anyway.
> 
> ok, just dont forget about the benchmarks please

Hmm, I don't have large and long enough clip to show timer values.
I guess it is fast enough. For the reference, rv34_inv_transform() takes 50 dezicycles.

[...]
> [...]
> > /** essential slice information */
> > typedef struct SliceInfo{
> >     int type;              ///< slice type (intra, inter)
> >     int size;              ///< size of the slice in bits
> >     int quant;             ///< quantizer used for this slice
> >     int vlc_set;           ///< VLCs used for this slice
> >     int start, end;        ///< start and end macroblocks of the slice
> >     int width;             ///< coded width
> >     int height;            ///< coded height
> > }SliceInfo;
> > 
> > /** decoder context */
> > typedef struct RV34DecContext{
> >     MpegEncContext s;
> >     int8_t *intra_types_hist;///< old block types, used for prediction
> >     int8_t *intra_types;     ///< block types
> >     int block_start;         ///< start of slice in blocks
> >     uint8_t *luma_dc_quant_i;///< luma subblock DC quantizer for intraframes
> >     uint8_t *luma_dc_quant_p;///< luma subblock DC quantizer for interframes
> > 
> >     RV34VLC *cur_vlcs;       ///< VLC set used for current frame decoding
> >     int bits;                ///< slice size in bits
> >     H264PredContext h;       ///< functions for 4x4 and 16x16 intra block prediction
> >     SliceInfo si;            ///< current slice information
> >     uint8_t *slice_data;     ///< saved slice data
> > 
> >     int *mb_type;            ///< internal macroblock types
> >     int block_type;          ///< current block type
> >     int luma_vlc;            ///< which VLC set will be used for decoding of luma blocks
> >     int chroma_vlc;          ///< which VLC set will be used for decoding of chroma blocks
> >     int is16;                ///< current block has additional 16x16 specific features or not
> >     int dmv[4][2];           ///< differential motion vectors for the current macroblock
> > 
> >     int rv30;                ///< indicates which RV variasnt is currently decoded
> >     int rpr;                 ///< one field size in RV30 slice header
> > 
> >     int avail[4];            ///< whether left, top, top rights and top left MBs are available
> > 
> >     int (*parse_slice_header)(struct RV34DecContext *r, GetBitContext *gb, SliceInfo *si);
> >     int (*decode_mb_info)(struct RV34DecContext *r);
> >     int (*decode_intra_types)(struct RV34DecContext *r, GetBitContext *gb, int8_t *dst);
> >     void (*loop_filter)(struct RV34DecContext *r);
> > }RV34DecContext;
> 
> SliceInfo.size  is redundant with bits
> SliceInfo.quant with qscale
> SliceInfo.start with block_start

dropped size, and start is used in calculations.
Maybe it will be used in parallel decoding as well.

> slice_data is a very ugly way to pass an argument
> id use:
>     slice_data = buf + offset;
>     rv34_decode_slice(r, r->si.size, r->si.end, &last, slice_data);
> instead of
>     r->slice_data = buf + offset;
>     rv34_decode_slice(r, r->si.size, r->si.end, &last);
> theres also only one use of it ...

changed

> [...]
> > /** translation of RV30/40 macroblock types to lavc ones */
> > static const int rv34_mb_type_to_lavc[12] = {
> >     MB_TYPE_INTRA,
> >     MB_TYPE_INTRA16x16,
> >     MB_TYPE_16x16 | MB_TYPE_L0,
> >     MB_TYPE_8x8   | MB_TYPE_L0,
> >     MB_TYPE_16x16 | MB_TYPE_L0,
> >     MB_TYPE_16x16 | MB_TYPE_L1,
> >     MB_TYPE_SKIP,
> 
> >     MB_TYPE_DIRECT2,
> 
> MB_TYPE_DIRECT2 | MB_TYPE_16x16 | MB_TYPE_L0L1
> 
> 
> >     MB_TYPE_16x8  | MB_TYPE_L0,
> >     MB_TYPE_8x16  | MB_TYPE_L0,
> 
> >     MB_TYPE_DIRECT2,
> 
> MB_TYPE_16x16 | MB_TYPE_L0L1,

replaced

> [...]
> > /**
> >  * Initialize all tables.
> >  */
> > static void rv34_init_tables()
> > {
> >     int i, j, k;
> > 
> >     for(i = 0; i < NUM_INTRA_TABLES; i++){
> >         for(j = 0; j < 2; j++){
> >             rv34_gen_vlc(rv34_table_intra_cbppat   [i][j], CBPPAT_VLC_SIZE,   &intra_vlcs[i].cbppattern[j],     NULL);
> >             rv34_gen_vlc(rv34_table_intra_secondpat[i][j], OTHERBLK_VLC_SIZE, &intra_vlcs[i].second_pattern[j], NULL);
> >             rv34_gen_vlc(rv34_table_intra_thirdpat [i][j], OTHERBLK_VLC_SIZE, &intra_vlcs[i].third_pattern[j],  NULL);
> >             for(k = 0; k < 4; k++)
> >                 rv34_gen_vlc(rv34_table_intra_cbp[i][j+k*2],  CBP_VLC_SIZE,      &intra_vlcs[i].cbp[j][k],         rv34_cbp_code);
> >         }
> >         for(j = 0; j < 4; j++)
> >             rv34_gen_vlc(rv34_table_intra_firstpat[i][j], FIRSTBLK_VLC_SIZE, &intra_vlcs[i].first_pattern[j], NULL);
> >         rv34_gen_vlc(rv34_intra_coeffvlc[i], COEFF_VLC_SIZE, &intra_vlcs[i].coefficient, NULL);
> >     }
> > 
> >     for(i = 0; i < NUM_INTER_TABLES; i++){
> >         rv34_gen_vlc(rv34_inter_cbppatvlc[i], CBPPAT_VLC_SIZE, &inter_vlcs[i].cbppattern[0], NULL);
> >         for(j = 0; j < 4; j++)
> >             rv34_gen_vlc(rv34_inter_cbpvlc[i][j], CBP_VLC_SIZE, &inter_vlcs[i].cbp[0][j], rv34_cbp_code);
> 
> the names 
> rv34_inter_cbpvlc and rv34_table_intra_cbp are inconsistant with each other
> rv34_inter_cbppatvlc and rv34_table_intra_cbppat are as well

renamed 

> [...]
> > /**
> >  * @defgroup transform RV30/40 inverse transform functions
> >  * @{
> >  */
> > 
> > static void rv34_row_transform(int temp[16], DCTELEM *block, const int offset)
> 
> sould be always_inline

added 

> [...]
> > /**
> >  * Dequantize ordinary 4x4 block.
> >  * @todo optimize
> >  */
> > static inline void rv34_dequant4x4(DCTELEM *block, int offset, int Qdc, int Q)
> > {
> >     int i, j;
> > 
> >     block += offset;
> >     block[0] = (block[0] * Qdc + 8) >> 4;
> >     for(i = 0; i < 4; i++)
> >         for(j = !i; j < 4; j++)
> >             block[j + i*8] = (block[j + i*8] * Q + 8) >> 4;
> > }
> > 
> > /**
> >  * Dequantize 4x4 block of DC values for 16x16 macroblock.
> >  * @todo optimize
> >  */
> > static inline void rv34_dequant4x4_16x16(DCTELEM *block, int offset, int Qdc, int Q)
> > {
> >     int i;
> > 
> >     block += offset;
> >     for(i = 0; i < 3; i++)
> >          block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Qdc + 8) >> 4;
> >     for(; i < 16; i++)
> >          block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Q + 8) >> 4;
> > }
> > /** @} */ //block functions
> 
> i think it would be more readable if the offset were added outside these
> functions
> rv34_dequant4x4_16x16(a,b,c,d)
> is less clear about what it does than the equivalent
> rv34_dequant4x4_16x16(a+b,c,d)

done for dequant and transform functions 

> > 
> > 
> 
> > /**
> >  * @defgroup bitstream RV30/40 bitstream parsing
> >  * @{
> >  */
> > 
> > static inline int decode210(GetBitContext *gb){
> >     if (get_bits1(gb))
> >         return 0;
> >     else
> >         return 2 - get_bits1(gb);
> > }
> 
> duplicate of the equally named function from vc1.c

I will present a patch to move it to bitstream.h  

> [...]
> > /**
> >  * Decode quantizer difference and return modified quantizer.
> >  */
> > static inline int rv34_decode_dquant(GetBitContext *gb, int quant)
> > {
> >     if(get_bits1(gb))
> >         return rv34_dquant_tab[get_bits1(gb)][quant];
> >     else
> >         return get_bits(gb, 5);
> > }
> 
> btw, doesnt rv34 depend on h263* ? if no then ill accept the above as it would
> be silly to add a dependancy just because of 5 lines of code but if it does
> depend already then you really should use the table and ff_h263_decode_mba()
> from h263 instead of duplicating it!

No, it seems to be not dependent on h.263

> > 
> > /** @} */ //bitstream functions
> > 
> > /**
> >  * @defgroup mv motion vector related code (prediction, reconstruction, motion compensation)
> >  * @{
> >  */
> > 
> 
> > /** macroblock partition width in 8x8 blocks */
> > static const uint8_t part_sizes_w[RV34_MB_TYPES] = { 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2 };
> > 
> > /** macroblock partition height in 8x8 blocks */
> > static const uint8_t part_sizes_h[RV34_MB_TYPES] = { 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2 };
> 
> these 2 tables can be overlapped to safe 7 bytes
> no iam not serious, this is just a joke :)
> but someone should write a linker which "compresses" const tables by by
> attempting to overlap them

let's wait for quantum computing (and quantum gcc bugs) 

> [...]
> > /**
> >  * Motion vector prediction for B-frames.
> >  */
> > static void rv34_pred_mv_b(RV34DecContext *r, int block_type)
> > {
[b-frame mv prediction]

redone it in (hopefully) clearer way

> >     V = s->dest[2] + (xoff>>1) + (yoff>>1)*s->uvlinesize;
> 
> vertical align nitpick
> Y = s->dest[0] +  xoff     +  yoff    *s->  linesize;
> U = s->dest[1] + (xoff>>1) + (yoff>>1)*s->uvlinesize;
> V = s->dest[2] + (xoff>>1) + (yoff>>1)*s->uvlinesize;

aligned 

> >     if(block_type == RV34_MB_P_16x8){
> >         qpel_mc[1][dxy](Y, srcY, s->linesize);
> >         Y    += 8;
> >         srcY += 8;
> >         qpel_mc[1][dxy](Y, srcY, s->linesize);
> >         chroma_mc[0]   (U, srcU, s->uvlinesize, 4, uvmx, uvmy);
> >         chroma_mc[0]   (V, srcV, s->uvlinesize, 4, uvmx, uvmy);
> >     }else if(block_type == RV34_MB_P_8x16){
> >         qpel_mc[1][dxy](Y, srcY, s->linesize);
> >         Y    += 8 * s->linesize;
> >         srcY += 8 * s->linesize;
> >         qpel_mc[1][dxy](Y, srcY, s->linesize);
> >         chroma_mc[1]   (U, srcU, s->uvlinesize, 8, uvmx, uvmy);
> >         chroma_mc[1]   (V, srcV, s->uvlinesize, 8, uvmx, uvmy);
> >     }else if(block_type == RV34_MB_P_8x8){
> >         qpel_mc[1][dxy](Y, srcY, s->linesize);
> >         chroma_mc[1]   (U, srcU, s->uvlinesize, 4, uvmx, uvmy);
> >         chroma_mc[1]   (V, srcV, s->uvlinesize, 4, uvmx, uvmy);
> >     }else{
> >         qpel_mc[0][dxy](Y, srcY, s->linesize);
> >         chroma_mc[0]   (U, srcU, s->uvlinesize, 8, uvmx, uvmy);
> >         chroma_mc[0]   (V, srcV, s->uvlinesize, 8, uvmx, uvmy);
> >     }
> 
> if(block_type == RV34_MB_P_16x8){
>     qpel_mc[1][dxy](Y, srcY, s->linesize);
>     Y    += 8;
>     srcY += 8;
> }else if(block_type == RV34_MB_P_8x16){
>     qpel_mc[1][dxy](Y, srcY, s->linesize);
>     Y    += 8 * s->linesize;
>     srcY += 8 * s->linesize;
> }
> qpel_mc[!is16x16][dxy](Y, srcY, s->linesize);
> chroma_mc[2-width]   (U, srcU, s->uvlinesize, height*4, uvmx, uvmy);
> chroma_mc[2-width]   (V, srcV, s->uvlinesize, height*4, uvmx, uvmy);

done

> [...]
> >     case RV34_MB_P_16x8:
> >     case RV34_MB_P_8x16:
> 
> >     case RV34_MB_B_BIDIR:
> >         rv34_pred_mv(r, block_type, 0);
> >         rv34_pred_mv(r, block_type, 1);
> 
> i do not think that using the subblock number to indicate the mv direction is
> a good idea, then i dont mind as long as it doesnt complicate the code ...

it's no more 

> [...]
> >         for(j = 0; j < 4; j++){
> >             no_left = !r->avail[0];
> >             YY = Y;
> >             for(i = 0; i < 4; i++, cbp >>= 1, YY += 4){
> >                 no_topright = no_up || (i==3 && j) || (i==3 && !j && (s->mb_x-1) == s->mb_width);
> >                 rv34_pred_4x4_block(r, YY, s->linesize, ittrans[intra_types[i]], no_up, no_left, i || (j==3), no_topright);
> >                 no_left = 0;
> 
> >                 if(!(cbp & 1)) continue;
> >                 rv34_add_4x4_block(YY, s->linesize, s->block[(i>>1)+(j&2)], (i&1)*4+(j&1)*32);
> 
> if(cbp)
>     rv34_add_4x4_block()

done

> >             }
> >             no_up = 0;
> >             Y += s->linesize * 4;
> 
> Y += s->linesize * 4 - 4*4;
> and YY becomes unneeded

done

> [...]
> >         dsp->add_pixels_clamped(s->block[0], Y, s->current_picture.linesize[0]);
> >         dsp->add_pixels_clamped(s->block[1], Y + 8, s->current_picture.linesize[0]);
> >         Y += s->current_picture.linesize[0] * 8;
> >         dsp->add_pixels_clamped(s->block[2], Y, s->current_picture.linesize[0]);
> >         dsp->add_pixels_clamped(s->block[3], Y + 8, s->current_picture.linesize[0]);
> 
> can be vertically aligned

aligned 

> [...]
> >         if(!r->is16){
> >             if(r->decode_intra_types(r, gb, intra_types) < 0)
> >                 return -1;
> >             r->luma_vlc   = 1;
> >         }else{
> >             t = get_bits(gb, 2);
> >             for(i = 0; i < 16; i++)
> >                 intra_types[(i & 3) + (i>>2) * s->b4_stride] = t;
> >             r->luma_vlc   = 2;
> >         }
> 
> if(!is16){
>     B
> }else{
>     A
> }
> 
> is less readable than
> if(is16){
>     A
> }else{
>     B
> }
> IMHO

no difference to me, so changed

> [...]
> >         if(!r->is16 && !(cbp & 1)) continue;
> >         blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
> >         blkoff = ((i & 1) << 2) + ((i & 4) << 3);
> >         if(cbp & 1)
> >             rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->luma_vlc, 0);
> >         if((cbp & 1) || r->is16){
> 
> how exactly can this be false here? if it cant the check is useless

dropped 

> [...]
> > static int check_slice_end(RV34DecContext *r, MpegEncContext *s)
> > {
> >     int bits;
> >     if(r->s.mb_skip_run > 1)
> >         return 0;
> >     if(s->mb_y >= s->mb_height)
> >         return 1;
> 
> and
> >     while(!check_slice_end(r, s) && s->mb_num_left-- && s->mb_y < s->mb_height) {
> 
> hmm i dunno, maybe iam tired but this somehow looks hackish
> cant you implement the mb_num_left-- if its needed at a sane point?

reorganized 

> [...]
> >     *last = 0;
> >     if(s->mb_y >= s->mb_height)
> >         *last = 1;
> >     if(r->bits > get_bits_count(gb) && show_bits(gb, r->bits-get_bits_count(gb)))
> >         *last = 1;
> > 
> >     return 0;
> 
> what about using the return value to return the "lastness" ...

why not? done

> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
-------------- next part --------------
A non-text attachment was scrubbed...
Name: rv34core.tar.bz2
Type: application/x-bzip2
Size: 10906 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20071208/65fa6ef5/attachment.bin>