[FFmpeg-devel] Indeo3 replacement, part 2

Fri Sep 25 14:00:10 CEST 2009

On Tue, Sep 22, 2009 at 12:09:23AM +0200, Maxim wrote:
> /**
>  *  Some constants for parsing frame flags.
>  */
> enum {
>     BS_8BIT_PEL  = 1<<1,
>     BS_KEYFRAME  = 1<<2,
>     BS_MV_Y_HALF = 1<<4,
>     BS_MV_X_HALF = 1<<5,
>     BS_NONREF    = 1<<8,
>     BS_BUFFER    = 9

Using an enum for flags is a but questionable IMO, but no big deal.

> #ifdef HAVE_BIGENDIAN
>             corr = (a << 8) + b;
> #else
>             corr = (b << 8) + a;
> #endif
>             delta_tab[i] = corr;
>             *sel_tab++   = DELTA_DYAD; /* set the code type = dyad */
> 
>             /* build the low-order corrector for the MODE 10 (8x8 processing) */
>             /* by replicating each vq delta A and B as follows: AABB. */
>             /* The high-order dword is set = 0. */
> #ifdef HAVE_BIGENDIAN
>             corr = (a << 8) + a;
>             corr = (corr << 8) + b;
>             corr = (corr << 8) + b;
> #else
>             corr = (b << 8) + b;
>             corr = (corr << 8) + a;
>             corr = (corr << 8) + a;
> #endif

You could probably (mis-)use MKTAG or something here.
But something is wrong here, HAVE_BIGENDIAN is always defined, it is
just either 0 or 1.

>         if (quads >= 0) {
>             for (j = 0; j < quads; j++) {
>                 for (i = 0; i < quads; i++) {
>                     /* concatenate two dyads to form a four-pixel corrector (quad) */
> #ifdef HAVE_BIGENDIAN
>                     *quad_tab++ = (delta_tab[j] << 16) + delta_tab[i];
> #else
>                     *quad_tab++ = (delta_tab[i] << 16) + delta_tab[j];
> #endif
>                     *sel_tab++  = DELTA_QUAD; /* set the code selector = quad */
> 
>                     /* build the double quad corrector for the mode 10 */
>                     *delta_lo++ = delta_m10_lo[n][j];
>                     *delta_hi++ = delta_m10_lo[n][i];
>                 }
>             }
>         } else {
>             quads = 0 - quads;
> 
>             for (j = 0; j < quads; j++) {
>                 for (i = 0; i < quads; i++) {
>                     /* concatenate two dyads to form a four-byte corrector (quad) */
> #ifdef HAVE_BIGENDIAN
>                     *quad_tab++ = (delta_tab[i] << 16) + delta_tab[j];
> #else
>                     *quad_tab++ = (delta_tab[j] << 16) + delta_tab[i];
> #endif
>                     *sel_tab++  = DELTA_QUAD; /* set the code selector = quad */
> 
>                     /* build the double quad corrector for the mode 10 */
>                     *delta_lo++ = delta_m10_lo[n][i];
>                     *delta_hi++ = delta_m10_lo[n][j];
>                 }
>             }
>         }//else

This is essentially duplicated code. Since it is in an av_cold area,
something like
swap = quads < 0;
quads = FFABS(quads);
if (swap) FFSWAP(..., delta_lo, delta_hi);
for (j = 0; j < quads; j++) {
    for (i = 0; i < quads; i++) {
        if (HAVE_BIGENDIAN ^ swap)
            *quad_tab++ = (delta_tab[j] << 16) + delta_tab[i];
        else
            *quad_tab++ = (delta_tab[i] << 16) + delta_tab[j];
....

Though it also is more customary to have the outer loop use i and the
inner one j...

>     /*  chroma dimensions should be four-byte aligned twice:
>         because of 1/4 subsampling
>         because of 4x4 block processing */
>     chroma_width  = (((ctx->width  + 3) >> 2) + 3) & ~3;
>     chroma_height = (((ctx->height + 3) >> 2) + 3) & ~3;

Huh?
FFALIGN(ctx->width, 16) >> 2;
should do the same?

>     /* setup output and reference pointers */
>     dst = &plane->pixels[ctx->buf_sel][(cell->ypos << 2) * plane->pitch + (cell->xpos << 2)];
>     /* reference block = prev_frame(cell_xpos + mv_x, cell_ypos + mv_y) */
>     mv_y = cell->mv_ptr[0];
>     mv_x = cell->mv_ptr[1];
>     offset = ((cell->ypos << 2) + mv_y) * plane->pitch + (cell->xpos << 2) + mv_x;
>     src = &plane->pixels[ctx->buf_sel ^ 1][offset];

Hm. I'd say
offset = (cell->ypos << 2) * plane->pitch + (cell->xpos << 2);
dst = plane->pixels[ctx->buf_sel] + offset;
...
offset += mv_y * plane->pitch + mv_x;
src = plane->pixels[ctx->buf_sel ^ 1] + offset;

is clearer (my point is about offset calculation, not so much &a[b] vs.
a + b

> /**
>  *  Average 4/8 pixels at once without rounding using softSIMD
>  */
> #define AVG_32(dst, src, ref)   AV_WN32((dst), ((AV_RN32(src) + AV_RN32(ref)) >> 1) & 0x7F7F7F7F)
> #define AVG_64(dst, src, ref)   AV_WN64((dst), ((AV_RN64(src) + AV_RN64(ref)) >> 1) & 0x7F7F7F7F7F7F7F7F)

Are all of src, dst, ref unaligned in general? If not, you should be
using casts instead of AV_RN*

> /**
>  *  Requant a 8 pixel line by duplicating each even pixel as follows:
>  *  ABCDEFGH -> AACCEEGG
>  */
> #ifdef HAVE_BIGENDIAN
>     #define REQUANT_64(hi, lo) {\
>         (hi)  = (hi) & 0xFF00FF00;\
>         (hi) |= (hi) >> 8;\
>         (lo)  = (lo) & 0xFF00FF00;\
>         (lo) |= (lo) >> 8;\
>     }
> #else
>     #define REQUANT_64(hi, lo) {\
>         (hi)  = (hi) & 0x00FF00FF;\
>         (hi) |= (hi) << 8;\
>         (lo)  = (lo) & 0x00FF00FF;\
>         (lo) |= (lo) << 8;\
>     }
> #endif

These do the same thing to hi and lo independently, and each of these are 32 bits.
I think this would be better as
static inline uint32_t requant(uint32_t a) {
#if HAVE_BIGENDIAN
    a &= 0xFF00FF00;
    a |= a >> 8;
#else
    a &= 0x00FF00FF;
    a |= a << 8;
#endif
   return a;
}

That is of course unless it makes more sense to get rid of the lo/hi
split very early on and just use uint64_t - that depends on how much the
complier would mess that up on 32 bit architectures I think, for what I
can tell it should work a lot better on 64 bit architectures.

>     if (mode == 1 || mode == 4) {
>         code        = ctx->alt_quant[vq_index];
>         prim_indx   = (code >> 4)  + ctx->cb_offset;
>         second_indx = (code & 0xF) + ctx->cb_offset;
> 
>         assert(prim_indx <= 23 && second_indx <= 23);
> 
>         prim_delta   = &delta_tabs   [prim_indx]  [0];
>         prim_sel     = &selector_tabs[prim_indx]  [0];
>         second_delta = &delta_tabs   [second_indx][0];
>         second_sel   = &selector_tabs[second_indx][0];
>     } else {
>         vq_index += ctx->cb_offset;
>         assert(vq_index <= 23);
> 
>         prim_delta   = &delta_tabs   [vq_index][0];
>         prim_sel     = &selector_tabs[vq_index][0];
>         second_delta = prim_delta;
>         second_sel   = prim_sel;
>     }

Is it really impossible to trigger these assert with a specially crafted
input file?

>     /* FIXME: if (vq_index >= 8 && (mode == 0 || mode == 3 || mode == 10) [win32] */

It's unclear to me what that is supposed to mean.

>         zoom_fac       =  mode >= 3;
>         line_offset    = (mode >= 3) ? row_offset : 0;

Uh,
line_offset = zoom_fac ? row_offset : 0;
?

>                         delta_tab = (line & 1) ? prim_delta : second_delta;
> 
>                         /* switch on code type: dyad, quad or RLE escape codes */
>                         switch ((line & 1) ? prim_sel[code] : second_sel[code]) {

Maybe you should be using
delta[2]
and
sel[2]
instead of prim_delta, second_delta, prim_sel, second_sel?
Would simplify this to
delta_tab = delta[line & 1];
switch (sel[line & 1][code]) {

>                         case DELTA_DYAD: /* apply VQ delta to two dyads (2+2 pixels) using softSIMD */
>                             if (((line & 1) ? prim_sel[*data_ptr] : second_sel[*data_ptr]) != DELTA_DYAD) {
>                                 av_log(avctx, AV_LOG_ERROR, "Mode %d: invalid VQ data!\n", mode);
>                                 return -1;
>                             }
>                             AV_WN16(dst + line_offset,     AV_RN16(ref    ) + delta_tab[*data_ptr++]);
>                             AV_WN16(dst + line_offset + 2, AV_RN16(ref + 2) + delta_tab[code]);
> 
>                             if (mode >= 3) {
>                                 /* odd lines are not coded but rather interpolated/replicated */
>                                 /* first line of the cell on the top of image? - replicate */
>                                 /* otherwise - interpolate */
>                                 if (is_top_of_cell && !cell->ypos) {
>                                     AV_WN32(dst, AV_RN32(dst + row_offset));
>                                 } else
>                                     AVG_32(dst, dst + row_offset, ref);
>                             }
>                             break;
> 
>                         case DELTA_QUAD: /* apply VQ delta to 4 pixels at once using softSIMD */
>                             AV_WN32(dst + line_offset, AV_RN32(ref) + delta_tab[code]);
> 
>                             if (mode >= 3) {
>                                 if (is_top_of_cell && !cell->ypos) {
>                                     AV_WN32(dst, AV_RN32(dst + row_offset));
>                                 } else
>                                     AVG_32(dst, dst + row_offset, ref);
>                             }
>                             break;

Except for the value written to dst + line_offset and the sel check,
these seem completely identical.

> 
>                         case RLE_ESC_FF: /* apply null delta to all lines up to the 2nd line */
>                             if (line) ERR_BAD_RLE(avctx, mode);
>                             copy_block4(dst, ref, row_offset, row_offset, 2 << zoom_fac);
>                             num_lines = 2;
>                             break;
> 
>                         case RLE_ESC_FE: /* apply null delta to all lines up to the 3rd line */
>                             if (line >= 2) ERR_BAD_RLE(avctx, mode);
>                             copy_block4(dst, ref, row_offset, row_offset, (3 - line) << zoom_fac);
>                             num_lines = 3 - line;
>                             break;
> 
>                         case RLE_ESC_FD: /* apply null delta to all remaining lines of this block */
>                             copy_block4(dst, ref, row_offset, row_offset, (4 - line) << zoom_fac);
>                             num_lines = 4 - line; /* go to process next block */
>                             break;

If you calculate num_lines first, these all do the same copy_block4

>                             return(-1);

Very inconsistent style.

>                                     fill_64(ref, ref_lo, ref_hi, (4 - line) << 1, row_offset);
>                                 num_lines = 4 - line; /* go to process next block */

Seems like calculating num_lines _first_ makes more sense almost
everywhere.

>                                 if (mode == 10) {
>                                     /* apply two 32bit deltas to two consecutive lines */
>                                     ref_lo = delta_lo[*data_ptr++];
>                                     ref_hi = delta_lo[code];
>                                     AV_WN32(dst                 , AV_RN32(dst                 ) + ref_lo);
>                                     AV_WN32(dst + 4             , AV_RN32(dst + 4             ) + ref_hi);
>                                     AV_WN32(dst + row_offset    , AV_RN32(dst + row_offset    ) + ref_lo);
>                                     AV_WN32(dst + row_offset + 4, AV_RN32(dst + row_offset + 4) + ref_hi);
>                                 } else {
>                                     /* apply two 16bit deltas to two consecutive lines */
>                                     ref_lo = delta_tab[*data_ptr++];
>                                     ref_hi = delta_tab[code];
>                                     AV_WN16(dst                 , AV_RN16(dst                 ) + ref_lo);
>                                     AV_WN16(dst + 2             , AV_RN16(dst + 2             ) + ref_hi);
>                                     AV_WN16(dst + row_offset    , AV_RN16(dst + row_offset    ) + ref_lo);
>                                     AV_WN16(dst + row_offset + 2, AV_RN16(dst + row_offset + 2) + ref_hi);
>                                 }
>                                 break;
> 
>                             case DELTA_QUAD:
>                                 if (mode == 10) {
>                                     /* apply the same 64bit delta to two consecutive lines */
>                                     ref_lo = delta_lo[code];
>                                     ref_hi = delta_hi[code];
>                                     AV_WN32(dst                 , AV_RN32(dst                 ) + ref_lo);
>                                     AV_WN32(dst + 4             , AV_RN32(dst + 4             ) + ref_hi);
>                                     AV_WN32(dst + row_offset    , AV_RN32(dst + row_offset    ) + ref_lo);
>                                     AV_WN32(dst + row_offset + 4, AV_RN32(dst + row_offset + 4) + ref_hi);
>                                 } else {
>                                     /* apply the same 32bit delta to two consecutive lines */
>                                     ref_lo = delta_tab[code];
>                                     AV_WN32(dst             , AV_RN32(dst             ) + ref_lo);
>                                     AV_WN32(dst + row_offset, AV_RN32(dst + row_offset) + ref_lo);
>                                 }

At least for mode == 10 these could be easily merged...

>     if (ctx->data_size > buf_size) {
>         av_log(avctx, AV_LOG_ERROR, "Frame data too short! Needed: %d, supplied: %d!\n",
>                ctx->data_size, buf_size);
>         return -1;
>     }

Just setting ctx->data_size = buf_size seems more user-friendly to me.
Also makes it easier to test the code has sufficient boundary-checks.

>     buf_ptr += 3; // skip reserved byte and checksum
> 
>     /* check frame dimensions */
>     height = bytestream_get_le16(&buf_ptr);
>     width  = bytestream_get_le16(&buf_ptr);
>     y_offset = bytestream_get_le32(&buf_ptr);
>     v_offset = bytestream_get_le32(&buf_ptr);
>     u_offset = bytestream_get_le32(&buf_ptr);

Which I suspect it doesn't these alone are already more than the 8 bytes
padding (I may just have missed the check though).

>         /* convert four pixels at once using softSIMD */
>         for (x = 0; x < plane->width >> 2; x++)
>             *dst32++ = (*src32++ & 0x7F7F7F7F) << 1;
> 
>         for (x <<= 2; x < plane->width; x++)
>             dst[x] = (src[x] & 0x7F) << 1;

The & 0x7F is useless.

>     res = decode_frame_headers(ctx, avctx, buf, buf_size);
>     if (res < 0)
>         return -1;
>     if (res)
>         return 0; /* skip sync(null) frames */

Uh, if you return 0 that means "none of the passed data was processed",
which seems like exactly the wrong thing for a null frame.
Also if you return >= 0 I think you are supposed to set data_size.