[FFmpeg-devel] [PATCH 01/N] RV30/40 Decoder - Core
Kostya
kostya.shishkov
Sat Dec 8 11:10:17 CET 2007
On Sat, Dec 08, 2007 at 05:32:07AM +0100, Michael Niedermayer wrote:
> On Fri, Dec 07, 2007 at 09:04:07AM +0200, Kostya wrote:
[...]
> > > please check if the following are identical (+-1 doesnt count as
> > > different)
> > >
> > > svq3_get_ue_golomb() and
> > > svq3_get_se_golomb()
> > >
> > > and if they are identical post START/STOP_TIMER scores
> > > also id like to see START/STOP_TIMER scores in that case for the
> > > original vlc based code
> > >
> > > the fastest should be placed in golomb.c/h
> > > and used
> >
> > I don't know when I can do benchmarks but switched to svq3_get_[su]e_golomb()
> > anyway.
>
> ok, just dont forget about the benchmarks please
Hmm, I don't have large and long enough clip to show timer values.
I guess it is fast enough. For the reference, rv34_inv_transform() takes 50 dezicycles.
[...]
> [...]
> > /** essential slice information */
> > typedef struct SliceInfo{
> > int type; ///< slice type (intra, inter)
> > int size; ///< size of the slice in bits
> > int quant; ///< quantizer used for this slice
> > int vlc_set; ///< VLCs used for this slice
> > int start, end; ///< start and end macroblocks of the slice
> > int width; ///< coded width
> > int height; ///< coded height
> > }SliceInfo;
> >
> > /** decoder context */
> > typedef struct RV34DecContext{
> > MpegEncContext s;
> > int8_t *intra_types_hist;///< old block types, used for prediction
> > int8_t *intra_types; ///< block types
> > int block_start; ///< start of slice in blocks
> > uint8_t *luma_dc_quant_i;///< luma subblock DC quantizer for intraframes
> > uint8_t *luma_dc_quant_p;///< luma subblock DC quantizer for interframes
> >
> > RV34VLC *cur_vlcs; ///< VLC set used for current frame decoding
> > int bits; ///< slice size in bits
> > H264PredContext h; ///< functions for 4x4 and 16x16 intra block prediction
> > SliceInfo si; ///< current slice information
> > uint8_t *slice_data; ///< saved slice data
> >
> > int *mb_type; ///< internal macroblock types
> > int block_type; ///< current block type
> > int luma_vlc; ///< which VLC set will be used for decoding of luma blocks
> > int chroma_vlc; ///< which VLC set will be used for decoding of chroma blocks
> > int is16; ///< current block has additional 16x16 specific features or not
> > int dmv[4][2]; ///< differential motion vectors for the current macroblock
> >
> > int rv30; ///< indicates which RV variasnt is currently decoded
> > int rpr; ///< one field size in RV30 slice header
> >
> > int avail[4]; ///< whether left, top, top rights and top left MBs are available
> >
> > int (*parse_slice_header)(struct RV34DecContext *r, GetBitContext *gb, SliceInfo *si);
> > int (*decode_mb_info)(struct RV34DecContext *r);
> > int (*decode_intra_types)(struct RV34DecContext *r, GetBitContext *gb, int8_t *dst);
> > void (*loop_filter)(struct RV34DecContext *r);
> > }RV34DecContext;
>
> SliceInfo.size is redundant with bits
> SliceInfo.quant with qscale
> SliceInfo.start with block_start
dropped size, and start is used in calculations.
Maybe it will be used in parallel decoding as well.
> slice_data is a very ugly way to pass an argument
> id use:
> slice_data = buf + offset;
> rv34_decode_slice(r, r->si.size, r->si.end, &last, slice_data);
> instead of
> r->slice_data = buf + offset;
> rv34_decode_slice(r, r->si.size, r->si.end, &last);
> theres also only one use of it ...
changed
> [...]
> > /** translation of RV30/40 macroblock types to lavc ones */
> > static const int rv34_mb_type_to_lavc[12] = {
> > MB_TYPE_INTRA,
> > MB_TYPE_INTRA16x16,
> > MB_TYPE_16x16 | MB_TYPE_L0,
> > MB_TYPE_8x8 | MB_TYPE_L0,
> > MB_TYPE_16x16 | MB_TYPE_L0,
> > MB_TYPE_16x16 | MB_TYPE_L1,
> > MB_TYPE_SKIP,
>
> > MB_TYPE_DIRECT2,
>
> MB_TYPE_DIRECT2 | MB_TYPE_16x16 | MB_TYPE_L0L1
>
>
> > MB_TYPE_16x8 | MB_TYPE_L0,
> > MB_TYPE_8x16 | MB_TYPE_L0,
>
> > MB_TYPE_DIRECT2,
>
> MB_TYPE_16x16 | MB_TYPE_L0L1,
replaced
> [...]
> > /**
> > * Initialize all tables.
> > */
> > static void rv34_init_tables()
> > {
> > int i, j, k;
> >
> > for(i = 0; i < NUM_INTRA_TABLES; i++){
> > for(j = 0; j < 2; j++){
> > rv34_gen_vlc(rv34_table_intra_cbppat [i][j], CBPPAT_VLC_SIZE, &intra_vlcs[i].cbppattern[j], NULL);
> > rv34_gen_vlc(rv34_table_intra_secondpat[i][j], OTHERBLK_VLC_SIZE, &intra_vlcs[i].second_pattern[j], NULL);
> > rv34_gen_vlc(rv34_table_intra_thirdpat [i][j], OTHERBLK_VLC_SIZE, &intra_vlcs[i].third_pattern[j], NULL);
> > for(k = 0; k < 4; k++)
> > rv34_gen_vlc(rv34_table_intra_cbp[i][j+k*2], CBP_VLC_SIZE, &intra_vlcs[i].cbp[j][k], rv34_cbp_code);
> > }
> > for(j = 0; j < 4; j++)
> > rv34_gen_vlc(rv34_table_intra_firstpat[i][j], FIRSTBLK_VLC_SIZE, &intra_vlcs[i].first_pattern[j], NULL);
> > rv34_gen_vlc(rv34_intra_coeffvlc[i], COEFF_VLC_SIZE, &intra_vlcs[i].coefficient, NULL);
> > }
> >
> > for(i = 0; i < NUM_INTER_TABLES; i++){
> > rv34_gen_vlc(rv34_inter_cbppatvlc[i], CBPPAT_VLC_SIZE, &inter_vlcs[i].cbppattern[0], NULL);
> > for(j = 0; j < 4; j++)
> > rv34_gen_vlc(rv34_inter_cbpvlc[i][j], CBP_VLC_SIZE, &inter_vlcs[i].cbp[0][j], rv34_cbp_code);
>
> the names
> rv34_inter_cbpvlc and rv34_table_intra_cbp are inconsistant with each other
> rv34_inter_cbppatvlc and rv34_table_intra_cbppat are as well
renamed
> [...]
> > /**
> > * @defgroup transform RV30/40 inverse transform functions
> > * @{
> > */
> >
> > static void rv34_row_transform(int temp[16], DCTELEM *block, const int offset)
>
> sould be always_inline
added
> [...]
> > /**
> > * Dequantize ordinary 4x4 block.
> > * @todo optimize
> > */
> > static inline void rv34_dequant4x4(DCTELEM *block, int offset, int Qdc, int Q)
> > {
> > int i, j;
> >
> > block += offset;
> > block[0] = (block[0] * Qdc + 8) >> 4;
> > for(i = 0; i < 4; i++)
> > for(j = !i; j < 4; j++)
> > block[j + i*8] = (block[j + i*8] * Q + 8) >> 4;
> > }
> >
> > /**
> > * Dequantize 4x4 block of DC values for 16x16 macroblock.
> > * @todo optimize
> > */
> > static inline void rv34_dequant4x4_16x16(DCTELEM *block, int offset, int Qdc, int Q)
> > {
> > int i;
> >
> > block += offset;
> > for(i = 0; i < 3; i++)
> > block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Qdc + 8) >> 4;
> > for(; i < 16; i++)
> > block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Q + 8) >> 4;
> > }
> > /** @} */ //block functions
>
> i think it would be more readable if the offset were added outside these
> functions
> rv34_dequant4x4_16x16(a,b,c,d)
> is less clear about what it does than the equivalent
> rv34_dequant4x4_16x16(a+b,c,d)
done for dequant and transform functions
> >
> >
>
> > /**
> > * @defgroup bitstream RV30/40 bitstream parsing
> > * @{
> > */
> >
> > static inline int decode210(GetBitContext *gb){
> > if (get_bits1(gb))
> > return 0;
> > else
> > return 2 - get_bits1(gb);
> > }
>
> duplicate of the equally named function from vc1.c
I will present a patch to move it to bitstream.h
> [...]
> > /**
> > * Decode quantizer difference and return modified quantizer.
> > */
> > static inline int rv34_decode_dquant(GetBitContext *gb, int quant)
> > {
> > if(get_bits1(gb))
> > return rv34_dquant_tab[get_bits1(gb)][quant];
> > else
> > return get_bits(gb, 5);
> > }
>
> btw, doesnt rv34 depend on h263* ? if no then ill accept the above as it would
> be silly to add a dependancy just because of 5 lines of code but if it does
> depend already then you really should use the table and ff_h263_decode_mba()
> from h263 instead of duplicating it!
No, it seems to be not dependent on h.263
> >
> > /** @} */ //bitstream functions
> >
> > /**
> > * @defgroup mv motion vector related code (prediction, reconstruction, motion compensation)
> > * @{
> > */
> >
>
> > /** macroblock partition width in 8x8 blocks */
> > static const uint8_t part_sizes_w[RV34_MB_TYPES] = { 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2 };
> >
> > /** macroblock partition height in 8x8 blocks */
> > static const uint8_t part_sizes_h[RV34_MB_TYPES] = { 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2 };
>
> these 2 tables can be overlapped to safe 7 bytes
> no iam not serious, this is just a joke :)
> but someone should write a linker which "compresses" const tables by by
> attempting to overlap them
let's wait for quantum computing (and quantum gcc bugs)
> [...]
> > /**
> > * Motion vector prediction for B-frames.
> > */
> > static void rv34_pred_mv_b(RV34DecContext *r, int block_type)
> > {
[b-frame mv prediction]
redone it in (hopefully) clearer way
> > V = s->dest[2] + (xoff>>1) + (yoff>>1)*s->uvlinesize;
>
> vertical align nitpick
> Y = s->dest[0] + xoff + yoff *s-> linesize;
> U = s->dest[1] + (xoff>>1) + (yoff>>1)*s->uvlinesize;
> V = s->dest[2] + (xoff>>1) + (yoff>>1)*s->uvlinesize;
aligned
> > if(block_type == RV34_MB_P_16x8){
> > qpel_mc[1][dxy](Y, srcY, s->linesize);
> > Y += 8;
> > srcY += 8;
> > qpel_mc[1][dxy](Y, srcY, s->linesize);
> > chroma_mc[0] (U, srcU, s->uvlinesize, 4, uvmx, uvmy);
> > chroma_mc[0] (V, srcV, s->uvlinesize, 4, uvmx, uvmy);
> > }else if(block_type == RV34_MB_P_8x16){
> > qpel_mc[1][dxy](Y, srcY, s->linesize);
> > Y += 8 * s->linesize;
> > srcY += 8 * s->linesize;
> > qpel_mc[1][dxy](Y, srcY, s->linesize);
> > chroma_mc[1] (U, srcU, s->uvlinesize, 8, uvmx, uvmy);
> > chroma_mc[1] (V, srcV, s->uvlinesize, 8, uvmx, uvmy);
> > }else if(block_type == RV34_MB_P_8x8){
> > qpel_mc[1][dxy](Y, srcY, s->linesize);
> > chroma_mc[1] (U, srcU, s->uvlinesize, 4, uvmx, uvmy);
> > chroma_mc[1] (V, srcV, s->uvlinesize, 4, uvmx, uvmy);
> > }else{
> > qpel_mc[0][dxy](Y, srcY, s->linesize);
> > chroma_mc[0] (U, srcU, s->uvlinesize, 8, uvmx, uvmy);
> > chroma_mc[0] (V, srcV, s->uvlinesize, 8, uvmx, uvmy);
> > }
>
> if(block_type == RV34_MB_P_16x8){
> qpel_mc[1][dxy](Y, srcY, s->linesize);
> Y += 8;
> srcY += 8;
> }else if(block_type == RV34_MB_P_8x16){
> qpel_mc[1][dxy](Y, srcY, s->linesize);
> Y += 8 * s->linesize;
> srcY += 8 * s->linesize;
> }
> qpel_mc[!is16x16][dxy](Y, srcY, s->linesize);
> chroma_mc[2-width] (U, srcU, s->uvlinesize, height*4, uvmx, uvmy);
> chroma_mc[2-width] (V, srcV, s->uvlinesize, height*4, uvmx, uvmy);
done
> [...]
> > case RV34_MB_P_16x8:
> > case RV34_MB_P_8x16:
>
> > case RV34_MB_B_BIDIR:
> > rv34_pred_mv(r, block_type, 0);
> > rv34_pred_mv(r, block_type, 1);
>
> i do not think that using the subblock number to indicate the mv direction is
> a good idea, then i dont mind as long as it doesnt complicate the code ...
it's no more
> [...]
> > for(j = 0; j < 4; j++){
> > no_left = !r->avail[0];
> > YY = Y;
> > for(i = 0; i < 4; i++, cbp >>= 1, YY += 4){
> > no_topright = no_up || (i==3 && j) || (i==3 && !j && (s->mb_x-1) == s->mb_width);
> > rv34_pred_4x4_block(r, YY, s->linesize, ittrans[intra_types[i]], no_up, no_left, i || (j==3), no_topright);
> > no_left = 0;
>
> > if(!(cbp & 1)) continue;
> > rv34_add_4x4_block(YY, s->linesize, s->block[(i>>1)+(j&2)], (i&1)*4+(j&1)*32);
>
> if(cbp)
> rv34_add_4x4_block()
done
> > }
> > no_up = 0;
> > Y += s->linesize * 4;
>
> Y += s->linesize * 4 - 4*4;
> and YY becomes unneeded
done
> [...]
> > dsp->add_pixels_clamped(s->block[0], Y, s->current_picture.linesize[0]);
> > dsp->add_pixels_clamped(s->block[1], Y + 8, s->current_picture.linesize[0]);
> > Y += s->current_picture.linesize[0] * 8;
> > dsp->add_pixels_clamped(s->block[2], Y, s->current_picture.linesize[0]);
> > dsp->add_pixels_clamped(s->block[3], Y + 8, s->current_picture.linesize[0]);
>
> can be vertically aligned
aligned
> [...]
> > if(!r->is16){
> > if(r->decode_intra_types(r, gb, intra_types) < 0)
> > return -1;
> > r->luma_vlc = 1;
> > }else{
> > t = get_bits(gb, 2);
> > for(i = 0; i < 16; i++)
> > intra_types[(i & 3) + (i>>2) * s->b4_stride] = t;
> > r->luma_vlc = 2;
> > }
>
> if(!is16){
> B
> }else{
> A
> }
>
> is less readable than
> if(is16){
> A
> }else{
> B
> }
> IMHO
no difference to me, so changed
> [...]
> > if(!r->is16 && !(cbp & 1)) continue;
> > blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
> > blkoff = ((i & 1) << 2) + ((i & 4) << 3);
> > if(cbp & 1)
> > rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->luma_vlc, 0);
> > if((cbp & 1) || r->is16){
>
> how exactly can this be false here? if it cant the check is useless
dropped
> [...]
> > static int check_slice_end(RV34DecContext *r, MpegEncContext *s)
> > {
> > int bits;
> > if(r->s.mb_skip_run > 1)
> > return 0;
> > if(s->mb_y >= s->mb_height)
> > return 1;
>
> and
> > while(!check_slice_end(r, s) && s->mb_num_left-- && s->mb_y < s->mb_height) {
>
> hmm i dunno, maybe iam tired but this somehow looks hackish
> cant you implement the mb_num_left-- if its needed at a sane point?
reorganized
> [...]
> > *last = 0;
> > if(s->mb_y >= s->mb_height)
> > *last = 1;
> > if(r->bits > get_bits_count(gb) && show_bits(gb, r->bits-get_bits_count(gb)))
> > *last = 1;
> >
> > return 0;
>
> what about using the return value to return the "lastness" ...
why not? done
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
-------------- next part --------------
A non-text attachment was scrubbed...
Name: rv34core.tar.bz2
Type: application/x-bzip2
Size: 10906 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20071208/65fa6ef5/attachment.bin>
More information about the ffmpeg-devel
mailing list