[Ffmpeg-devel] [RFC] [PATCH] FLC/FLX/DTA encoder

Tue Feb 27 01:48:05 CET 2007

Hi,

> > +static int encode_black(AVCodecContext *avctx, AVFrame *p, unsigned char *buf) {
> > +    int i;
> > +
> > +    // FIXME: make it faster
> > +    for (i = 0; i < avctx->height*p->linesize[0]; i++)
> > +        if (p->data[0][i] != 0)
> > +            break;
> > +
> > +    if (i != avctx->height*p->linesize[0])
> > +        return -1; // not full black
> 
> for (i = 0; i < avctx->height*p->linesize[0]; i++)
>     if (p->data[0][i])
>         return -1; // not full black

ok

> > +static int encode_copy(AVCodecContext *avctx, AVFrame *p, unsigned char *buf) {
> > +    int pos = 6, i;
> > +    const int stride = avctx->width*((avctx->bits_per_sample+7)/8);
> 
> as avctx->width*((avctx->bits_per_sample+7)/8) occurs several times maybe a
> bytes_per_line in FlicEncodeContext would be a good idea?

I may add it.

> [...]
> > +static int encode_brun(AVCodecContext *avctx, AVFrame *p, unsigned char *buf) {
> > +    int lines, pos = 6, y_ptr = 0, pixel_ptr;
> > +    unsigned char *pixels = p->data[0];
> > +    const int stride = avctx->width*((avctx->bits_per_sample+7)/8);
> > +//    const int stride = avctx->width;
> > +
> > +    AV_WL16(buf+4, FLI_BRUN); // type
> > +    
> > +    for (lines = 0; lines < avctx->height; lines++) {
> > +	int nppos = pos, npackets = 0;
> > +	int pixel_countdown = stride;
> > +	int copy_pos = pos; 
> > +	unsigned int run_count = 0;
> > +
> > +	pixel_ptr = y_ptr;
> > +
> > +	pos++; // number of packets, patched later
> > +
> > +	while (pixel_countdown > 0) {
> > +	    // no point checking for a run if only 1 byte left
> > +	    if ((pixel_countdown > 2) && 
> > +		(pixels[pixel_ptr] == pixels[pixel_ptr+1]) &&
> > +		(pixels[pixel_ptr] == pixels[pixel_ptr+2])) {
> > +		// 3 bytes the same in a row, so start counting the run
> > +		// 3 is the threshold, because a run of two forces and extra byte
> > +		
> > +		// flush old run
> > +		if (run_count != 0) {
> > +		    AV_WL8(buf+copy_pos, (signed char)(0-run_count));
> > +		    run_count = 0;
> > +		}
> > +
> > +		run_count = 3;
> > +		while ((pixels[pixel_ptr] == pixels[pixel_ptr+run_count]) && 
> > +		       (run_count < pixel_countdown) && 
> > +		       (run_count < 127)) {
> > +		    run_count++;
> > +		}
> > +		npackets++;
> > +		AV_WL8(buf+pos, (signed char)(run_count));
> 
> what does the signed char cast do here?

The field must be signed, as negative run_count means copy, while
positive means run. Or vice versa. Better if I add AV_WL8S ? I dont
think.

> > +		pos++;
> > +		AV_WL8(buf+pos, pixels[pixel_ptr]);
> > +		pos++;
> > +		pixel_ptr       += run_count;
> > +		pixel_countdown -= run_count;
> > +		run_count = 0;
> > +	    } else {
> > +	        // 3 bytes in a row not able to be RLE'd, so emit the copy
> > +
> > +		// flush old run
> > +	        if (run_count == 128) {
> > +		    AV_WL8(buf+copy_pos, (signed char)(0-run_count));
> 
> AV_WL8(buf+copy_pos, 128);

-128

> > +		    run_count = 0;
> > +                }
> > +
> > +		// start of a copy run
> > +		if (run_count == 0)
> > +		{
> > +		    copy_pos = pos; // written later 
> > +		    pos++;
> > +		    npackets++;
> > +		}
> > +		run_count++;
> > +		AV_WL8(buf+pos, pixels[pixel_ptr]);
> > +		pos++;
> > +		pixel_ptr++;
> > +		pixel_countdown--;
> > +	    }
> > +        }
> > +
> > +	// flush old run
> > +	if (run_count != 0) {
> > +	    AV_WL8(buf+copy_pos, (signed char)(0-run_count));
> > +	    run_count = 0;
> > +        }
> > +
> > +	// this frame would be invalid this way, so just report failed
> > +	// FIXME: try solvin it in a better way
> > +	if (npackets > 255) 
> > +	    return -1;
> > +
> > +	AV_WL8(buf+nppos, npackets);
> 
> what about
> 
> while (pixel < end) {
>     int run=1;
> 
>     while(pixel+1 < end && run < 127 && pixel[0] == pixel[1]){
>         run++;
>         pixel++;
>     }
>     if(run > limit){
>         *buf++= run;
>         *buf++= *pixel++;
>     }else{
>         pixel -= run - 2;
>         run=1;
>         while(pixel+1 < end && run < 127 && pixel[0] != pixel[1]){
>             run++;
>             pixel++;
>         }
>         *buf++= - run;
>         memcpy(buf, pixel-run, run);
>         buf+= run;
>     }
>     npackets++;
>     if(npackets>255){
>         limit++;
>         goto retry_from_begin;
>     }
> }

I was not using gotos.

> > +static int encode_dta_brun(AVCodecContext *avctx, AVFrame *p, unsigned char *buf) {
> > +    int lines, pos = 6, y_ptr = 0, pixel_ptr;
> > +    unsigned short *pixels = (unsigned short*)p->data[0];
> > +
> > +    // FIXME: fix this to handle 24bit input too
> > +    if (avctx->codec_id != CODEC_ID_FLIDTA ||
> > +	avctx->bits_per_sample == 8 ||
> > +	avctx->bits_per_sample == 24)
> > +	return -1;
> > +
> > +    AV_WL16(buf+4, FLI_DTA_BRUN); // type
> > +    
> > +    for (lines = 0; lines < avctx->height; lines++) {
> > +	int nppos = pos, npackets = 0;
> > +	int pixel_countdown = avctx->width;
> > +	int copy_pos = pos; 
> > +	unsigned int run_count = 0;
> > +
> > +	pixel_ptr = y_ptr;
> > +
> > +	pos++; // number of packets, patched later
> > +
> > +	while (pixel_countdown > 0) {
> > +	    // no point checking for a run if only 1 byte left
> > +	    if ((pixel_countdown > 1) && 
> > +		(pixels[pixel_ptr] == pixels[pixel_ptr+1])) {
> > +		// 2 bytes the same in a row, so start counting the run
> > +		
> > +		// flush old run
> > +		if (run_count != 0) {
> > +		    AV_WL8(buf+copy_pos, (signed char)(0-run_count));
> > +		    run_count = 0;
> > +		}
> > +
> > +		run_count = 2;
> > +		while ((pixels[pixel_ptr] == pixels[pixel_ptr+run_count]) && 
> > +		       (run_count < pixel_countdown) && 
> > +		       (run_count < 127)) {
> > +		    run_count++;
> > +		}
> > +		npackets++;
> > +		AV_WL8(buf+pos, (signed char)(run_count));
> > +		pos++;
> > +		AV_WL16(buf+pos, pixels[pixel_ptr]);
> > +		pos += 2;
> > +		pixel_ptr       += run_count;
> > +		pixel_countdown -= run_count;
> > +		run_count = 0;
> > +	    } else {
> > +		// 2 bytes in a row not able to be RLE'd, so emit the copy
> > +		
> > +		// flush old run
> > +	        if (run_count == 128) {
> > +		    AV_WL8(buf+copy_pos, (signed char)(0-run_count));
> > +		    run_count = 0;
> > +                }
> > +
> > +                // start of a copy run
> > +		if (run_count == 0)
> > +		{
> > +		    copy_pos = pos; // written later 
> > +		    pos++;
> > +		    npackets++;
> > +		}
> > +		run_count++;
> > +		AV_WL16(buf+pos, pixels[pixel_ptr]);
> > +		pos += 2;
> > +		pixel_ptr++;
> > +		pixel_countdown--;
> > +	    }
> > +        }
> > +
> > +	// flush old run
> > +	if (run_count != 0) {
> > +	    AV_WL8(buf+copy_pos, (signed char)(0-run_count));
> > +	    run_count = 0;
> > +        }
> > +
> > +	// this frame would be invalid this way, so just report failed
> > +	// FIXME: try solvin it in a better way
> > +	if (npackets > 255) 
> > +	    return -1;
> > +
> > +	AV_WL8(buf+nppos, npackets);
> > +	
> > +	y_ptr += p->linesize[0]/((avctx->bits_per_sample+7)/8);
> > +    }
> > +
> > +    // padding
> > +    if ((pos % 2) == 1) {
> > +	AV_WL8(buf+pos, 0);
> > +	pos++;
> > +    }
> > +        
> > +    AV_WL32(buf, pos); // size field
> > +
> > +    return pos;
> > +}
> 
> very similar to encode_brun, this should be merged

With two exceptions: the run_count field is exchanged (- means X, +
means Y) and it works on pixels and not bytes. Its splitted for speed
reason.

> > +static int encode_lc(AVCodecContext *avctx, AVFrame *p, unsigned char *buf) {
> 
> this too and several other functions are very similar to the previous rle
> encoders and must be merged

Similar to encode_brun in principle, as all the RLE is based on the same
principle. I may add put_run() and put_copy() if that factorizing
satisfies you.

> > +// 8 bit FLC DELTA
> > +static int encode_delta8(AVCodecContext *avctx, AVFrame *p, unsigned char *buf) {
> [...]
> > +			    while (run_count > 254) {
> > +				npackets++;
> > +				AV_WL8(buf+pos, 254); // skip
> > +				pos++;
> > +				AV_WL8(buf+pos, 0x00); 
> > +				pos++;
> 
> these really should be *bug++= X or bytestream_put_be16() could be used both
> should be more readable

I thought about moving to bytestream_ already, but the time I wrote it
there was no such layer yet.

> also the function is VERY messy and full of duplicated code

Yep, this function should be cleaned up in a way.

> > +    // FIXME: this is rather unoptimal, but simple
> > +    for (i = 0; flic_encoders[i>>1].encode != NULL; i++) {
> > +        if ((avctx->codec_id == CODEC_ID_FLIC && !flic_encoders[i>>1].is_flc) ||
> > +            (avctx->codec_id == CODEC_ID_FLIX && !flic_encoders[i>>1].is_flx) ||
> > +            (avctx->codec_id == CODEC_ID_FLIDTA && !flic_encoders[i>>1].is_dta))
> > +            continue;
> > +        if (i & 1) {
> > +            if (avctx->bits_per_sample == 8 && remapped)
> > +                sizes[i] = flic_encoders[i>>1].encode(avctx, &(s->remap_frame), buf+chunkpos);
> > +            else
> > +                sizes[i] = -1;
> 
> using INT_MAX should avoid a few checks below

You prefer it that way?

> > /*
> >  * Palette optimisations
> >  * Copyright (c) 2006 Sakura Industries Ltd.
> >  * Author: Steven Johnson
> 
> the palette stuff belongs into a seperate patch!

Anyway, would you accept this (palette.c) theoretically? If yes, where
should it reside? swscaler? libavcodec?

> [...]
> > typedef struct IFR_Match {
> >     uint32_t CurIdx;			  /// Index in Current
> >     uint32_t PrevIdx;			  /// that matches this Index in previous
> > 
> >     void *AsocState;                      /// Associated state -needed for qsort.
> > } IFR_Match;
> 
> IIRC doxygen wants ///< for comments placed like that

Ok

> [...]
> > static int ifr_matchcompare(const void* a, const void* b) {
> >     // We want qsort to sort in ascending order, so comparison test is reversed.
> >     IFR_Match *f = *(IFR_Match**) a; 
> >     IFR_Match *s = *(IFR_Match**) b;
> >     
> >     // note: f->AsocState->ConcordanceMatrix must == s->AsocState->ConcordanceMatrix
> >     // so we use the same matrix, from f as it should generate more efficient code.
> >     IFRPaletteOptimiseState *state = (IFRPaletteOptimiseState *)f->AsocState;
> > 
> >     if (state->ConcordanceMatrix[f->PrevIdx][f->CurIdx] <
> >         state->ConcordanceMatrix[s->PrevIdx][s->CurIdx]) {
> > 	return 1;
> >     } else if (state->ConcordanceMatrix[f->PrevIdx][f->CurIdx] >
> >                state->ConcordanceMatrix[s->PrevIdx][s->CurIdx]) {
> > 	return -1;
> >     }
> 
> return value0-value1; should do

done

> >     return 0; // Must be equal.
> > }
> > 
> > int ff_ifr_optimise(AVFrame *cur, AVFrame *prev, AVFrame *remap, int width, int height)
> > {
> 
> very very messy
> 
> concordance[256*256][2];
> for all pixels
>     concordance[256*prevpix + pixel][0]++;
> for all x
>     concordance[x][1]= x;
> 
> sort concordance[2] elements per [0]
> 
> for(x=0; x<256*256; x++){
>     int match= concordance[x][1];
>     int m0= match & 255;
>     int m1= match >> 8;
>     if(mapped[0][m0]>=0 || mapped[1][m1]>=0)
>         continue;
>     mapped[0][m0]= m1;
>     mapped[1][m1]= m0;
> }
> 
> PS: the solution is of course not optimal but i see no obvious way how to
> find the optimal solution in P time

I never reviewed that code and I am not familiar with it, I would ask
the author.

Steve, could you comment on this?

--
Alex Beregszaszi