[FFmpeg-devel] [PATCH v3 2/2] Newtek SpeedHQ decoder.

Mon Jan 9 19:30:52 EET 2017

On 1/8/17, Steinar H. Gunderson <steinar+ffmpeg at gunderson.no> wrote:
> + * for the longest (10-bit) codes.
> + */
> +#define ALPHA_VLC_BITS 5
> +
> +typedef struct SHQContext {
> +    AVCodecContext *avctx;
> +    BlockDSPContext bdsp;
> +    IDCTDSPContext idsp;
> +    ScanTable intra_scantable;
> +    int quant_matrix[64];
> +    enum { SHQ_SUBSAMPLING_420, SHQ_SUBSAMPLING_422, SHQ_SUBSAMPLING_444 }
> +        subsampling;
> +    enum { SHQ_NO_ALPHA, SHQ_RLE_ALPHA, SHQ_DCT_ALPHA } alpha_type;
> +} SHQContext;
> +
> +
> +/* AC codes: Very similar but not identical to MPEG-2. */
> +static uint16_t speedhq_vlc[123][2] = {

Can this be uint8_t too?

> +    {0x02, 2}, {0x06, 3}, {0x07, 4}, {0x1c, 5},
> +    {0x1d, 5}, {0x05, 6}, {0x04, 6}, {0x7b, 7},
> +    {0x7c, 7}, {0x23, 8}, {0x22, 8}, {0xfa, 8},
> +    {0xfb, 8}, {0xfe, 8}, {0xff, 8}, {0x1f,14},
> +    {0x1e,14}, {0x1d,14}, {0x1c,14}, {0x1b,14},
> +    {0x1a,14}, {0x19,14}, {0x18,14}, {0x17,14},
> +    {0x16,14}, {0x15,14}, {0x14,14}, {0x13,14},
> +    {0x12,14}, {0x11,14}, {0x10,14}, {0x18,15},
> +    {0x17,15}, {0x16,15}, {0x15,15}, {0x14,15},
> +    {0x13,15}, {0x12,15}, {0x11,15}, {0x10,15},

[...]

> +    speedhq_run,
> +    speedhq_level,
> +};
> +
> +/* NOTE: The first element is always 16, unscaled. */
> +static const uint16_t unscaled_quant_matrix[64] = {

This can be uint8_t; the largest entry in the table is 83.

> +    16, 16, 19, 22, 26, 27, 29, 34,
> +    16, 16, 22, 24, 27, 29, 34, 37,
> +    19, 22, 26, 27, 29, 34, 34, 38,
> +    22, 22, 26, 27, 29, 34, 37, 40,
> +    22, 26, 27, 29, 32, 35, 40, 48,
> +    26, 27, 29, 32, 35, 40, 48, 58,
> +    26, 27, 29, 34, 38, 46, 56, 69,
> +    27, 29, 35, 38, 46, 56, 69, 83
> +};
> +
> +static uint8_t ff_speedhq_static_rl_table_store[2][2*MAX_RUN + MAX_LEVEL +
> 3];
> +
> +static VLC ff_dc_lum_vlc_le;
> +static VLC ff_dc_chroma_vlc_le;
> +static VLC ff_dc_alpha_run_vlc_le;
> +static VLC ff_dc_alpha_level_vlc_le;
> +
> +static inline int decode_dc_le(GetBitContext *gb, int component)
> +{
> +    int code, diff;
> +
> +    if (component == 0 || component == 3) {
> +        code = get_vlc2(gb, ff_dc_lum_vlc_le.table, DC_VLC_BITS, 2);
> +    } else {
> +        code = get_vlc2(gb, ff_dc_chroma_vlc_le.table, DC_VLC_BITS, 2);
> +    }
> +    if (code < 0) {
> +        av_log(NULL, AV_LOG_ERROR, "invalid dc code at\n");
> +        return 0xffff;

Why this specific return value? I suppose decoding of the other blocks still continues?

> +    }
> +    if (!code) {
> +        diff = 0;
> +    } else {
> +        diff = get_xbits_le(gb, code);
> +    }
> +    return diff;
> +}
> +
> +static inline int decode_alpha_block(const SHQContext *s, GetBitContext
> *gb, uint8_t last_alpha[16], uint8_t *dest, int linesize)
> +{
> +    uint8_t block[128];
> +    int i = 0, x, y;
> +
> +    memset(block, 0, sizeof(block));
> +
> +    {
> +        OPEN_READER(re, gb);
> +
> +        for ( ;; ) {
> +            int run, level;
> +
> +            UPDATE_CACHE_LE(re, gb);
> +            GET_VLC(run, re, gb, ff_dc_alpha_run_vlc_le.table,
> ALPHA_VLC_BITS, 2);
> +
> +            if (run == 128) break;
> +            i += run;
> +            if (i >= 128)
> +                return AVERROR_INVALIDDATA;
> +
> +            UPDATE_CACHE_LE(re, gb);
> +            GET_VLC(level, re, gb, ff_dc_alpha_level_vlc_le.table,
> ALPHA_VLC_BITS, 2);
> +            block[i++] = level;
> +        }
> +
> +        CLOSE_READER(re, gb);
> +    }
> +
> +    for (y = 0; y < 8; y++) {
> +        for (x = 0; x < 16; x++) {
> +            last_alpha[x] -= block[y * 16 + x];
> +        }
> +        memcpy(dest, last_alpha, 16);
> +        dest += linesize;
> +    }
> +
> +    return 0;
> +}
> +
> +static inline int decode_dct_block(const SHQContext *s, GetBitContext *gb,
> int last_dc[4], int component, uint8_t *dest, int linesize)
> +{
> +    const int *quant_matrix = s->quant_matrix;
> +    const uint8_t *scantable = s->intra_scantable.permutated;
> +    int16_t block[64];
> +    int dc_offset;
> +
> +    s->bdsp.clear_block(block);
> +
> +    dc_offset = decode_dc_le(gb, component);
> +    last_dc[component] -= dc_offset;  /* Note: Opposite of most codecs. */
> +    block[scantable[0]] = last_dc[component];  /* quant_matrix[0] is always
> 16. */
> +
> +    /* Read AC coefficients. */
> +    {
> +        int i = 0;
> +        OPEN_READER(re, gb);
> +        for ( ;; ) {
> +            int level, run;
> +            UPDATE_CACHE_LE(re, gb);
> +            GET_RL_VLC(level, run, re, gb, ff_rl_speedhq.rl_vlc[0],
> +                       TEX_VLC_BITS, 2, 0);
> +            if (level == 127) {
> +                break;
> +            } else if (level) {
> +                i += run;
> +                if (i > MAX_INDEX)
> +                    return AVERROR_INVALIDDATA;
> +                /* If next bit is 1, level = -level */
> +                level = (level ^ SHOW_SBITS(re, gb, 1)) -
> +                        SHOW_SBITS(re, gb, 1);
> +                LAST_SKIP_BITS(re, gb, 1);
> +            } else {
> +                /* Escape. */
> +#if MIN_CACHE_BITS < 6 + 6 + 12
> +#error MIN_CACHE_BITS is too small for the escape code, add UPDATE_CACHE
> +#endif
> +                run = SHOW_UBITS(re, gb, 6) + 1;
> +                SKIP_BITS(re, gb, 6);
> +                level = SHOW_UBITS(re, gb, 12) - 2048;
> +                LAST_SKIP_BITS(re, gb, 12);
> +
> +                i += run;
> +                if (i > MAX_INDEX)
> +                    return AVERROR_INVALIDDATA;
> +            }
> +
> +            block[scantable[i]] = (level * quant_matrix[i]) >> 4;
> +        }
> +        CLOSE_READER(re, gb);
> +    }
> +
> +    s->idsp.idct_put(dest, linesize, block);
> +
> +    return 0;
> +}
> +
> +static int decode_speedhq_field(const SHQContext *s, const uint8_t *buf,
> int buf_size, AVFrame *frame, int field_number, int start, int end, int
> line_stride)
> +{
> +    int ret, slice_number, slice_offsets[5];
> +    int linesize_y  = frame->linesize[0] * line_stride;
> +    int linesize_cb = frame->linesize[1] * line_stride;
> +    int linesize_cr = frame->linesize[2] * line_stride;
> +    int linesize_a;
> +
> +    if (s->alpha_type != SHQ_NO_ALPHA)
> +        linesize_a = frame->linesize[3] * line_stride;
> +
> +    if (end < start || end - start < 3 || end > buf_size)
> +        return AVERROR_INVALIDDATA;
> +
> +    slice_offsets[0] = start;
> +    slice_offsets[4] = end;
> +    for (slice_number = 1; slice_number < 4; slice_number++) {
> +        uint32_t last_offset, slice_len;
> +
> +        last_offset = slice_offsets[slice_number - 1];
> +        slice_len = AV_RL24(buf + last_offset);
> +        slice_offsets[slice_number] = last_offset + slice_len;
> +
> +        if (slice_len < 3 || slice_offsets[slice_number] > end - 3)
> +            return AVERROR_INVALIDDATA;
> +    }
> +
> +    for (slice_number = 0; slice_number < 4; slice_number++) {
> +        GetBitContext gb;
> +        uint32_t slice_begin, slice_end;
> +        int x, y;
> +
> +        slice_begin = slice_offsets[slice_number];
> +        slice_end = slice_offsets[slice_number + 1];
> +
> +        if ((ret = init_get_bits8(&gb, buf + slice_begin + 3, slice_end -
> slice_begin - 3)) < 0)
> +            return ret;
> +
> +        for (y = slice_number * 16 * line_stride; y < frame->height; y +=
> line_stride * 64) {
> +            uint8_t *dest_y, *dest_cb, *dest_cr, *dest_a;
> +            int last_dc[4] = { 1024, 1024, 1024, 1024 };
> +            uint8_t last_alpha[16];
> +
> +            memset(last_alpha, 255, sizeof(last_alpha));
> +
> +            dest_y = frame->data[0] + frame->linesize[0] * (y +
> field_number);
> +            if (s->subsampling == SHQ_SUBSAMPLING_420) {
> +                dest_cb = frame->data[1] + frame->linesize[1] * (y/2 +
> field_number);
> +                dest_cr = frame->data[2] + frame->linesize[2] * (y/2 +
> field_number);
> +            } else {
> +                dest_cb = frame->data[1] + frame->linesize[1] * (y +
> field_number);
> +                dest_cr = frame->data[2] + frame->linesize[2] * (y +
> field_number);
> +            }
> +            if (s->alpha_type != SHQ_NO_ALPHA) {
> +                dest_a = frame->data[3] + frame->linesize[3] * (y +
> field_number);
> +            }
> +
> +            for (x = 0; x < frame->width; x += 16) {
> +                /* Decode the four luma blocks. */
> +                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y,
> linesize_y)) < 0)
> +                    return ret;
> +                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8,
> linesize_y)) < 0)
> +                    return ret;
> +                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8
> * linesize_y, linesize_y)) < 0)
> +                    return ret;
> +                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8
> * linesize_y + 8, linesize_y)) < 0)
> +                    return ret;
> +
> +                /*
> +                 * Decode the first chroma block. For 4:2:0, this is the
> only one;
> +                 * for 4:2:2, it's the top block; for 4:4:4, it's the
> top-left block.
> +                 */
> +                if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb,
> linesize_cb)) < 0)
> +                    return ret;
> +                if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr,
> linesize_cr)) < 0)
> +                    return ret;
> +
> +                if (s->subsampling != SHQ_SUBSAMPLING_420) {
> +                    /* For 4:2:2, this is the bottom block; for 4:4:4, it's
> the bottom-left block. */
> +                    if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb
> + 8 * linesize_cb, linesize_cb)) < 0)
> +                        return ret;
> +                    if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr
> + 8 * linesize_cr, linesize_cr)) < 0)
> +                        return ret;
> +
> +                    if (s->subsampling == SHQ_SUBSAMPLING_444) {
> +                        /* Top-right and bottom-right blocks. */
> +                        if ((ret = decode_dct_block(s, &gb, last_dc, 1,
> dest_cb + 8, linesize_cb)) < 0)
> +                            return ret;
> +                        if ((ret = decode_dct_block(s, &gb, last_dc, 2,
> dest_cr + 8, linesize_cr)) < 0)
> +                            return ret;
> +                        if ((ret = decode_dct_block(s, &gb, last_dc, 1,
> dest_cb + 8 * linesize_cb + 8, linesize_cb)) < 0)
> +                            return ret;
> +                        if ((ret = decode_dct_block(s, &gb, last_dc, 2,
> dest_cr + 8 * linesize_cr + 8, linesize_cr)) < 0)
> +                            return ret;
> +
> +                        dest_cb += 8;
> +                        dest_cr += 8;
> +                    }
> +                }
> +                dest_y += 16;
> +                dest_cb += 8;
> +                dest_cr += 8;
> +
> +                if (s->alpha_type == SHQ_RLE_ALPHA) {
> +                    /* Alpha coded using 16x8 RLE blocks. */
> +                    if ((ret = decode_alpha_block(s, &gb, last_alpha,
> dest_a, linesize_a)) < 0)
> +                        return ret;
> +                    if ((ret = decode_alpha_block(s, &gb, last_alpha,
> dest_a + 8 * linesize_a, linesize_a)) < 0)
> +                        return ret;
> +                    dest_a += 16;
> +                } else if (s->alpha_type == SHQ_DCT_ALPHA) {
> +                    /* Alpha encoded exactly like luma. */
> +                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a,
> linesize_a)) < 0)
> +                        return ret;
> +                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a
> + 8, linesize_a)) < 0)
> +                        return ret;
> +                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a
> + 8 * linesize_a, linesize_a)) < 0)
> +                        return ret;
> +                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a
> + 8 * linesize_a + 8, linesize_a)) < 0)
> +                        return ret;
> +                    dest_a += 16;
> +                }
> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static void compute_quant_matrix(int *output, int qscale)
> +{
> +    int i;
> +    for (i = 0; i < 64; i++) output[i] = unscaled_quant_matrix[i] * qscale;
> +}
> +
> +static int speedhq_decode_frame(AVCodecContext *avctx,
> +                                void *data, int *got_frame,
> +                                AVPacket *avpkt)
> +{
> +    SHQContext * const s = avctx->priv_data;
> +    const uint8_t *buf   = avpkt->data;
> +    int buf_size         = avpkt->size;
> +    AVFrame *frame       = data;
> +    uint8_t quality;
> +    uint32_t second_field_offset;
> +    int ret;
> +
> +    if (buf_size < 4)
> +        return AVERROR_INVALIDDATA;
> +
> +    quality = buf[0];
> +    if (quality >= 100) {
> +        return AVERROR_INVALIDDATA;
> +    }
> +
> +    compute_quant_matrix(s->quant_matrix, 100 - quality);
> +
> +    second_field_offset = AV_RL24(buf + 1);
> +    if (second_field_offset >= buf_size - 3) {
> +        return AVERROR_INVALIDDATA;
> +    }
> +
> +    avctx->coded_width = FFALIGN(avctx->width, 16);
> +    avctx->coded_height = FFALIGN(avctx->height, 16);
> +
> +    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
> +        return ret;
> +    }
> +    frame->key_frame = 1;
> +
> +    if (second_field_offset == 4) {
> +        /*
> +         * Overlapping first and second fields is used to signal
> +         * encoding only a single field (the second field then comes
> +         * as a separate, later frame).
> +         */
> +        frame->height >>= 1;
> +        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 0, 4,
> buf_size, 1)) < 0)
> +            return ret;
> +    } else {
> +        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 0, 4,
> second_field_offset, 2)) < 0)
> +            return ret;
> +        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 1,
> second_field_offset, buf_size, 2)) < 0)
> +            return ret;
> +    }
> +
> +    *got_frame = 1;
> +    return buf_size;
> +}
> +
> +/*
> + * Alpha VLC. Run and level are independently coded, and would be
> + * outside the default limits for MAX_RUN/MAX_LEVEL, so we don't
> + * bother with combining them into one table.
> + */
> +static av_cold void compute_alpha_vlcs(void)
> +{
> +    uint16_t run_code[129], level_code[256];
> +    uint8_t run_bits[129], level_bits[256];
> +    int run, level;
> +
> +    for (run = 0; run < 128; run++) {
> +        if (!run) {
> +            /* 0 -> 0. */
> +            run_code[run] = 0;
> +            run_bits[run] = 1;
> +        } else if (run <= 4) {
> +            /* 10xx -> xx plus 1. */
> +            run_code[run] = ((run - 1) << 2) | 1;
> +            run_bits[run] = 4;
> +        } else {
> +            /* 111xxxxxxx -> xxxxxxxx. */
> +            run_code[run] = (run << 3) | 7;
> +            run_bits[run] = 10;
> +        }
> +    }
> +
> +    /* 110 -> EOB. */
> +    run_code[128] = 3;
> +    run_bits[128] = 3;
> +
> +    INIT_LE_VLC_STATIC(&ff_dc_alpha_run_vlc_le, ALPHA_VLC_BITS, 129,
> +                       run_bits, 1, 1,
> +                       run_code, 2, 2, 160);
> +
> +    for (level = 0; level < 256; level++) {
> +        int8_t signed_level = (int8_t)level;
> +        int abs_signed_level = abs(signed_level);
> +        int sign = (signed_level < 0) ? 1 : 0;
> +
> +        if (abs_signed_level == 1) {
> +            /* 1s -> -1 or +1 (depending on sign bit). */
> +            level_code[level] = (sign << 1) | 1;
> +            level_bits[level] = 2;
> +        } else if (abs_signed_level >= 2 && abs_signed_level <= 5) {
> +            /* 01sxx -> xx plus 2 (2..5 or -2..-5, depending on sign bit).
> */
> +            level_code[level] = ((abs_signed_level - 2) << 3) | (sign << 2)
> | 2;
> +            level_bits[level] = 5;
> +        } else {
> +            /*
> +             * 00xxxxxxxx -> xxxxxxxx, in two's complement. 0 is
> technically an
> +             * illegal code (that would be encoded by increasing run), but
> it
> +             * doesn't hurt and simplifies indexing.
> +             */
> +            level_code[level] = level << 2;
> +            level_bits[level] = 10;
> +        }
> +    }
> +
> +    INIT_LE_VLC_STATIC(&ff_dc_alpha_level_vlc_le, ALPHA_VLC_BITS, 256,
> +                       level_bits, 1, 1,
> +                       level_code, 2, 2, 288);
> +}
> +
> +static uint32_t reverse(uint32_t num, int bits)
> +{
> +    return bitswap_32(num) >> (32 - bits);
> +}
> +
> +static void reverse_code(const uint16_t *code, const uint8_t *bits,
> +                         uint16_t *reversed_code, int num_entries)
> +{
> +    int i;
> +    for (i = 0; i < num_entries; i++) {
> +        reversed_code[i] = reverse(code[i], bits[i]);
> +    }
> +}
> +
> +static av_cold int speedhq_decode_init(AVCodecContext *avctx)
> +{
> +    static int done = 0;
> +    uint16_t ff_mpeg12_vlc_dc_lum_code_reversed[12];
> +    uint16_t ff_mpeg12_vlc_dc_chroma_code_reversed[12];
> +    SHQContext * const s = avctx->priv_data;
> +
> +    s->avctx = avctx;
> +
> +    if (!done) {
> +        int i;
> +
> +        /* Exactly the same as MPEG-2, except little-endian. */
> +        reverse_code(ff_mpeg12_vlc_dc_lum_code,
> +                     ff_mpeg12_vlc_dc_lum_bits,
> +                     ff_mpeg12_vlc_dc_lum_code_reversed,
> +                     12);
> +        INIT_LE_VLC_STATIC(&ff_dc_lum_vlc_le, DC_VLC_BITS, 12,
> +                           ff_mpeg12_vlc_dc_lum_bits, 1, 1,
> +                           ff_mpeg12_vlc_dc_lum_code_reversed, 2, 2, 512);
> +        reverse_code(ff_mpeg12_vlc_dc_chroma_code,
> +                     ff_mpeg12_vlc_dc_chroma_bits,
> +                     ff_mpeg12_vlc_dc_chroma_code_reversed,
> +                     12);

What about storing the reversed codes directly in the source code, so that this step is not required?