[FFmpeg-devel] [RFC v2 3/3] daaladec: Implement a native Daala decoder

Tue Dec 29 16:55:13 CET 2015

On 29.12.2015 03:12, Rostislav Pehlivanov wrote:
> Hmm, kinda odd.
> I've attached a v2 of the RFC Patch which does this:
> 1. Fixed some typos.
> 2. Fixed some missing newlines at the end of daalatab*
> 3. Moves the pix_fmt finding function inside the main decoder file.
> 
> Try building now.

First, this fails building with --enable-shared, because daala_find_p_format
is not static.

Second, this decoder crashes, hangs and triggers lots of undefined behavior. :-/

Third, the decoder seems to return only the first frame, repeatedly,
instead of the following frames.

More details are interleaved in the diff below.

> diff --git a/libavcodec/daala.h b/libavcodec/daala.h
> new file mode 100644
> index 0000000..535e78f
> --- /dev/null
> +++ b/libavcodec/daala.h
> @@ -0,0 +1,78 @@
[...]
> +typedef struct DaalaBitstreamHeader {
> +    uint8_t key_frame;
> +    uint8_t bipred;
> +    uint8_t ref_num;
> +    uint8_t act_mask;
> +    uint8_t qm;
> +    uint8_t haar;
> +    uint8_t golden;
> +    uint8_t pvq_qm[DAALA_MAX_PLANES][DAALA_QM_SIZE];
> +} DaalaBitstreamHeader;
> +
> +typedef struct DaalaSharedContext {
> +    DaalaBitstreamHeader h;
> +} DaalaSharedContext;

What is the point of having DaalaSharedContext in addition to DaalaBitstreamHeader?

> diff --git a/libavcodec/daala_entropy.h b/libavcodec/daala_entropy.h
> new file mode 100644
> index 0000000..3fdcaef
> --- /dev/null
> +++ b/libavcodec/daala_entropy.h
> @@ -0,0 +1,554 @@
[...]
> +/* Updates the generic exponential probability model */
> +static av_always_inline void daalaent_exp_model_update(DaalaCDF *c, int *ex, int x,
> +                                                       int xs, int id, int integrate)
> +{
> +    int i, xenc;
> +    ent_rng *cdf = &c->cdf[id*c->y];
> +    if (cdf[15] + c->inc > 32767) {
> +        for (i = 0; i < 16; i++)
> +            cdf[i] = (cdf[i] >> 1) + i + 1;
> +    }
> +    xenc = FFMIN(15, xs);
> +    for (i = xenc; i < 16; i++)
> +        cdf[i] += c->inc;
> +    x = FFMIN(x, 32767);
> +    *ex += ((x << 16) - *ex) >> integrate;

This subtraction can overflow.

[...]
> +/* "+derf | It was a hack for the screen coding wavelet tools." */

A rather bad hack...

> +static inline int daalaent_decode_unary(DaalaEntropy *e)
> +{
> +    int rval = 0;
> +    while (!daalaent_decode_bits(e, 1))

This is an infinite loop that can hang the decoder.

> +        rval++;
> +    return rval;
> +}
[...]
> +/* Decodes quantized coefficients from  the bitsteam */
> +static inline void daalaent_decode_laplace_vector(DaalaEntropy *e, dctcoef *y,
> +                                                  int n, int k, dctcoef *curr,
> +                                                  const dctcoef *means)
> +{
> +    int i, exp_q8, mean_k_q8, mean_sum_ex_q8, sum_ex = 0, kn = k, ran_delta = 0;
> +    if (k <= 1) {
> +        daalaent_decode_laplace_delta(e, y, n, k, curr, means);
> +        return;
> +    }
> +    if (!k) {
> +        curr[DAALAENT_PVQ_COUNT] = DAALAENT_PVQ_NOVAL;
> +        curr[DAALAENT_PVQ_COUNT_EX] = DAALAENT_PVQ_NOVAL;
> +        curr[DAALAENT_PVQ_K] = 0;
> +        curr[DAALAENT_PVQ_SUM_EX] = 0;
> +        memset(y, 0, n*sizeof(dctcoef));
> +        return;
> +    }
> +    mean_k_q8 = means[DAALAENT_PVQ_K];
> +    mean_sum_ex_q8 = means[DAALAENT_PVQ_SUM_EX];
> +    if (mean_k_q8 < 1 << 23)
> +        exp_q8 = 256*mean_k_q8/(1 + mean_sum_ex_q8);

This multiplication can overflow.

> +    else
> +        exp_q8 = mean_k_q8/(1 + (mean_sum_ex_q8 >> 8));
> +    for (i = 0; i < n; i++) {
> +        int x, ex;
> +        if (!kn)
> +            break;
> +        if (kn <= 1 && i != n - 1) {
> +            daalaent_decode_laplace_delta(e, y + i, n - i, kn, curr, means);
> +            ran_delta = 1;
> +            i = n;
> +            break;
> +        }
> +        ex = (2*exp_q8*kn + (n - i))/(2*(n - i));
> +        if (ex > kn*256)
> +            ex = kn*256;
> +        sum_ex += (2*256*kn + (n - i))/(2*(n - i));

All these multiplications involving kn can overflow.

> +        if (i != n - 1)
> +            x = daalaent_decode_laplace_pvq(e, ex, kn);
> +        else
> +            x = kn;
> +        y[i] = x*daalaent_cphase(e, x);
> +        kn -= abs(x);
> +    }
> +    memset(&y[i], 0, (n - i)*sizeof(dctcoef)); /* Zero the rest */
> +    if (!ran_delta) {
> +        curr[DAALAENT_PVQ_COUNT] = DAALAENT_PVQ_NOVAL;
> +        curr[DAALAENT_PVQ_COUNT_EX] = DAALAENT_PVQ_NOVAL;
> +    }
> +    curr[DAALAENT_PVQ_K] = k - kn;
> +    curr[DAALAENT_PVQ_SUM_EX] = sum_ex;
> +}
> +
> +/* Expectation value is in Q16 */
> +static inline int daalaent_decode_generic(DaalaEntropy *e, DaalaCDF *c, int *ex,
> +                                          int max, int integrate)
> +{
> +    int rval, lsb = 0, log_ex = daalaent_log_ex(*ex);
> +    const int shift = FFMAX(0, (log_ex - 5) >> 1);
> +    const int id = FFMIN(DAALAENT_MODEL_TAB - 1, log_ex);
> +    const int ms = (max + (1 << shift >> 1)) >> shift;
> +    int xs = (max == -1) ? 16 : FFMIN(ms + 1, 16);
> +    ent_rng *cdf = &c->cdf[id*c->y];
> +    if (!max)
> +        return 0;
> +    if ((xs = daalaent_decode_cdf(e, cdf, xs, 0, CDF_UNSCALED)) == 15) {
> +        int g = ((2*(*ex) >> 8) + (1 << shift >> 1)) >> shift;

This multiplication can overflow, causing g to become negative, e.g. -256.

> +        ent_win decay = FFMAX(2, FFMIN(254, 256*g/(g + 256)));

In that case this gives a division by zero crash.

> +static inline void daalaent_decode_init(DaalaEntropy *e, const uint8_t *buf,
> +                                        int buf_size)
> +{
> +    e->rbuf = buf;
> +    e->erbuf = buf + buf_size;

This doesn't check if buf_size is 0. In that case e->end_window is always 0,
causing daalaent_decode_bits to always return 0, leading to the infinite loop
in daalaent_decode_unary.

> +    e->buf = buf;
> +    e->ebuf = buf + buf_size;
> +    e->err = 0;
> +    e->diff  = 0;
> +    e->range = 32768;
> +    e->count = -15;
> +    e->eos_offset = 18 - DAALAENT_WSIZE;
> +    e->end_window = 0;
> +    e->end_window_size = 0;
> +    daalaent_fillup(e);
> +}
[...]
> diff --git a/libavcodec/daala_pvq.h b/libavcodec/daala_pvq.h
> new file mode 100644
> index 0000000..ea8a86b
> --- /dev/null
> +++ b/libavcodec/daala_pvq.h
> @@ -0,0 +1,369 @@
[...]
> +static inline void daalapvq_synth(dctcoef *xcoeff, dctcoef *ypulse, dctcoef *ref,
> +                                  int n, double gr, uint8_t ref_p, double gain,
> +                                  double theta, const int16_t *qmatrix,
> +                                  const int16_t *qmatrix_inv)
> +{
> +    int i, m, nn = n - ref_p, s = 0, yy = 0;
> +    double scale, r[DAALAPVQ_MAX_PART_SIZE], x[DAALAPVQ_MAX_PART_SIZE];
> +    if (ref_p) {
> +        for (i = 0; i < n; i++)
> +            r[i] = ref[i]*qmatrix[i]*DAALA_QM_SCALE_UNIT;

The 'ref[i]*qmatrix[i]' multiplication can overflow.

> +    }
> +    m = !ref_p ? 0 : daalapvq_householder_c(r, n, gr, &s);
> +    for (i = 0; i < nn; i++)
> +        yy += ypulse[i]*ypulse[i];
> +    scale = !yy ? 0 : gain/sqrt(yy);
> +    if (!ref_p) {
> +        for (i = 0; i < n; i++)
> +            xcoeff[i] = lrint((ypulse[i]*scale)*(qmatrix_inv[i]*DAALA_QM_INV_SCALE_UNIT));
> +    } else {
> +        scale *= sin(theta);
> +        for (i = 0; i < m; i++)
> +            x[i] = ypulse[i]*scale;
> +        x[m] = -s*gain*cos(theta);
> +        for (i = m; i < nn; i++)
> +            x[i+1] = ypulse[i]*scale;
> +        daalapvq_householder_a(x, r, n);
> +        for (i = 0; i < n; i++)
> +            xcoeff[i] = lrint(x[i]*qmatrix_inv[i]*DAALA_QM_INV_SCALE_UNIT);
> +    }
> +}
> +
> +static av_always_inline void daalapvq_adapt_shuffle(int *dst, int *src, int spd,
> +                                                    int idx, int mul)
> +{
> +    if (src[idx] < 1)
> +        return;
> +    dst[idx+0] += (mul*src[idx+0] - dst[idx+0]) >> spd;

This multiplication can overflow.

[...]
> +static inline void daalapvq_decode_vector(DaalaEntropy *e, DaalaPVQ *pvq,
> +                                          dctcoef *out, dctcoef *ref,
> +                                          const double beta,
> +                                          uint8_t key_frame, int p,
> +                                          uint8_t *skip_rest,
> +                                          uint8_t has_err, int band_idx,
> +                                          int qm_off, enum DaalaBsize bsize)
> +{
> +    int i, k;
> +    int qg = 0, skip = 0, itheta = (!!key_frame), has_ref = !key_frame;
> +    double qcg, gain, theta = 0.0f, gr = 0.0f, gain_off = 0.0f;
> +    dctcoef tmp[DAALAPVQ_MAX_PART_SIZE] = {0};
> +
> +    const int robust = has_err || key_frame;
> +    const int band_len = pvq->size[band_idx];
> +    const int16_t *qmatrix = &pvq->qmatrix[qm_off];
> +    const int16_t *qmatrix_inv = &pvq->qmatrix_inv[qm_off];
> +
> +    if (!skip_rest[(band_idx + 2) % 3]) {
> +        int iloc = (!!p)*DAALA_NBSIZES*DAALAPVQ_PARTITIONS_MAX + bsize*DAALAPVQ_PARTITIONS_MAX + band_idx;
> +        i = daalaent_decode_cdf_adapt(e, &pvq->pvqtheta_gain_cdf, iloc, 8 + 7*pvq->skip[band_idx]);
> +        if (!key_frame && i >= 10)
> +            i++;
> +        if (key_frame && i >= 8)
> +            i++;
> +        if (i >= 8) {
> +            i -= 8;
> +            skip_rest[0] = skip_rest[1] = skip_rest[2] = 1;
> +        }
> +        qg = i & 1;
> +        itheta = (i >> 1) - 1;
> +        has_ref = !(itheta == -1);
> +    }
> +    if (qg) {
> +        int *ex = pvq->pvqgain_ex[p][bsize] + band_idx, ex_tmp = *ex;
> +        DaalaCDF *mcdf = has_ref ? &pvq->pvqgain_ref_mcdf : &pvq->pvqgain_noref_mcdf;
> +        qg = 1 + daalaent_decode_generic(e, mcdf, &ex_tmp, -1, 2);
> +        *ex += ((qg << 16) - *ex) >> 2;

The left shift can overflow, also the subtraction, and the final addition, too.

[...]
> diff --git a/libavcodec/daaladec.c b/libavcodec/daaladec.c
> new file mode 100644
> index 0000000..3501a6b
> --- /dev/null
> +++ b/libavcodec/daaladec.c
> @@ -0,0 +1,804 @@
[...]
> +/* Get DC level */
> +static void get_haar_dc_sb(DaalaContext *s, HaarGradient *g, dctcoef *d, int x, int y,
> +                                  uint8_t p, uint8_t lim_pass)
> +{
> +    int q, q_dc;
> +    int xdec = s->fmt->dec[p][0];
> +    const int aw = s->width >> xdec;
> +    const int ln = DAALA_LOG_BSIZE_MAX - xdec;
> +    dctcoef dc_pred = 0, **dc_buf = s->haar_dc_buf[p];
> +    if (!s->quantizer[p])
> +        q_dc = 1;
> +    else
> +        q_dc = FFMAX(1, s->quantizer[p]*s->s.h.pvq_qm[p][daala_get_qm_idx(DAALA_NBSIZES - 1, 0)] >> 4);
> +    if (x > 0 && y > 0) {
> +        if (lim_pass) { /* ALERT: coeffs could change */
> +            dc_pred = 22*dc_buf[x-1][y-0] - 9*dc_buf[x-1][y-1] +
> +                      15*dc_buf[x+0][y-1] + 4*dc_buf[x+1][y-1];
> +        } else {
> +            dc_pred = 23*dc_buf[x-1][y-0] - 10*dc_buf[x-1][y-1] + 19*dc_buf[x-0][y-1];
> +        }

These two dc_pred calculations can overflow in various ways.

> +        dc_pred = (dc_pred + 16) >> 5;
> +    } else {
> +        dc_pred += x > 0 ? dc_buf[x-1][y-0] : 0;
> +        dc_pred += y > 0 ? dc_buf[x-0][y-1] : 0;
> +    }
> +    q = daalaent_decode_generic(&s->e, &s->haar_dc_mcdf[p], &s->haar_sb_ex[p], -1, 2);
> +    q *= daalaent_cphase(&s->e, q);
> +    q = q*q_dc + dc_pred;

This multiplication can overflow.

[...]
> +/* Haar block decoding and transform */
> +static void decode_block_haar(DaalaContext *s, int x, int y, int p, enum DaalaBsize bsize)
> +{
> +    int i, j, k, l, n = 1 << (bsize + 2);
> +    const int dx = x << bsize, dy = y << bsize;
> +    const int aw = s->width >> s->fmt->dec[p][0];
> +    const int boffset = (dy << 2)*aw + (dx << 2);
> +
> +    dctcoef tree[4][4];
> +    dctcoef pred[DAALA_BSIZE_MAX*DAALA_BSIZE_MAX];
> +    dctcoef tpred[DAALA_BSIZE_MAX*DAALA_BSIZE_MAX];
> +
> +    daala_calc_prediction(s, pred, s->dcoef[p], dx, dy, p, bsize);
> +    memcpy(tpred, pred, n*n*sizeof(dctcoef));
> +
> +    tree[0][0] = daalaent_decode_cdf_adapt(&s->e, &s->haar_bit_cdf, p, 16);
> +    if (tree[0][0] == 15)
> +        tree[0][0] += daalaent_decode_unary(&s->e);
> +
> +    if (tree[0][0] > 24) {
> +        s->e.err = 1;
> +        return;
> +    } else if (tree[0][0] > 1) {
> +        int tmp = daalaent_decode_bits(&s->e, tree[0][0] - 1);
> +        tree[0][0] = (1 << (tree[0][0] - 1)) | tmp;
> +    }
> +
> +    tree[1][1] = decode_haar_coeff_tree_split(s, tree[0][0],              3, 0);
> +    tree[0][1] = decode_haar_coeff_tree_split(s, tree[0][0] - tree[1][1], 4, 0);
> +    tree[1][0] = tree[0][0] - tree[1][1] - tree[0][1];
> +
> +    decode_tree_sum(s, pred, 1, 0, tree[0][1], bsize + 2, 0);
> +    decode_tree_sum(s, pred, 0, 1, tree[1][0], bsize + 2, 1);
> +    decode_tree_sum(s, pred, 1, 1, tree[1][1], bsize + 2, 2);
> +
> +    for (i = 0; i < n; i++) {
> +        for (j = (i == 0); j < n; j++)
> +            pred[i*n + j] *= daalaent_cphase(&s->e, pred[i*n + j]);
> +    }
> +    for (i = 0; i < 3; i++) {           /* Direction   */
> +        for (j = 0; j < bsize+2; j++) { /* Level       */
> +            int bo = (((i + 1) >> 1) << j)*n + (((i + 1) & 1) << j);
> +            int q = !s->quantizer[p] ? 1 : s->quantizer[p]*daala_haar_qm[i == 2][j] >> 4;
> +            for (k = 0; k < 1 << j; k++)
> +                for (l = 0; l < 1 << j; l++)
> +                    pred[bo + k*n + l] = q*pred[bo + k*n + l] + tpred[bo + k*n + l];

The q*pred[] multiplication can overflow.

[...]
> diff --git a/libavcodec/daaladsp.c b/libavcodec/daaladsp.c
> new file mode 100644
> index 0000000..ba03520
> --- /dev/null
> +++ b/libavcodec/daaladsp.c
> @@ -0,0 +1,2123 @@
[...]
> +static av_always_inline void idct_1D_4(pixel *x, int xstride, const pixel y[4])
> +{
> +    int t2h, t0 = y[0], t1 = y[1], t2 = y[2], t3 = y[3];
> +    t3 += (t1*18293 + 8192) >> 14;
> +    t1 -= (t3*21407 + 16384) >> 15;
> +    t3 += (t1*23013 + 16384) >> 15;
> +    t2 = t0 - t2;

These four lines can overflow.

> +    t2h = DAALA_DCT_RSHIFT(t2, 1);
> +    t0 -= t2h - DAALA_DCT_RSHIFT(t3, 1);
> +    t1 = t2h - t1;

These two, as well.

> +    *(x + 0*xstride) = (pixel)t0;
> +    *(x + 1*xstride) = (pixel)(t2 - t1);

This subtraction can overflow.

> +    *(x + 2*xstride) = (pixel)t1;
> +    *(x + 3*xstride) = (pixel)(t0 - t3);

This one, too.

> +}
> +
> +static av_always_inline void idct_1D_8(pixel *x, int xstride, const pixel y[16])
> +{
> +    int t1h, t4h, t6h, t0 = y[0], t1 = y[1], t2 = y[2], t3 = y[3], t4 = y[4];
> +    int t5 = y[5], t6 = y[6], t7 = y[7];
> +    t5 -= (t3*2485 + 4096) >> 13;
> +    t3 += (t5*18205 + 16384) >> 15;
> +    t5 -= (t3*2485 + 4096) >> 13;
> +    t7 -= (t1*3227 + 16384) >> 15;
> +    t1 += (t7*6393 + 16384) >> 15;
> +    t7 -= (t1*3227 + 16384) >> 15;
> +    t1 += t3;

These seven lines can overflow.

> +    t1h = DAALA_DCT_RSHIFT(t1, 1);
> +    t3 = t1h - t3;

This line, too.

> +    t5 += t7;
> +    t7 = DAALA_DCT_RSHIFT(t5, 1) - t7;
> +    t3 += (t5*7489 + 4096) >> 13;
> +    t5 -= (t3*11585 + 8192) >> 14;
> +    t3 -= (t5*19195 + 16384) >> 15;
> +    t6 += (t2*21895 + 16384) >> 15;
> +    t2 -= (t6*15137 + 8192) >> 14;
> +    t6 += (t2*21895 + 16384) >> 15;
> +    t0 += (t4*13573 + 16384) >> 15;
> +    t4 -= (t0*11585 + 8192) >> 14;
> +    t0 += (t4*13573 + 16384) >> 15;

These nine lines can also overflow.

> +    t4 = t2 - t4;
> +    t4h = DAALA_DCT_RSHIFT(t4, 1);
> +    t2 = t4h - t2;
> +    t6 = t0 - t6;
> +    t6h = DAALA_DCT_RSHIFT(t6, 1);
> +    t0 -= t6h;
> +    t7 = t6h - t7;
> +    t6 -= t7;
> +    t2 += DAALA_DCT_RSHIFT(t3, 1);
> +    t3 = t2 - t3;
> +    t5 += t4h;
> +    t4 -= t5;

This can overflow.

> +    t0 += t1h;
> +    t1 = t0 - t1;
> +    *(x + 0*xstride) = (pixel)t0;
> +    *(x + 1*xstride) = (pixel)t4;
> +    *(x + 2*xstride) = (pixel)t2;
> +    *(x + 3*xstride) = (pixel)t6;
> +    *(x + 4*xstride) = (pixel)t7;
> +    *(x + 5*xstride) = (pixel)t3;
> +    *(x + 6*xstride) = (pixel)t5;
> +    *(x + 7*xstride) = (pixel)t1;
> +}
> +
> +static av_always_inline void idct_1D_16(pixel *x, int xstride, const pixel y[16])
> +{
> +    int tfh, tbh, t1h, tdh, t8h, tch, tah, t2h;
> +    int t0 = y[0], t1 = y[1], t2 = y[2], t3 = y[3], t4 = y[4],  t5 = y[5];
> +    int t6 = y[6], t7 = y[7], t8 = y[8], t9 = y[9], ta = y[10], tb = y[11];
> +    int tc = y[12], td = y[13], te = y[14], tf = y[15];
> +    t1 += (tf*13573 + 16384) >> 15;
> +    tf -= (t1*11585 + 8192) >> 14;
> +    t1 += ((tf*13573 + 16384) >> 15)+t7;
> +    td -= (t3*10947 + 8192) >> 14;
> +    t3 += (td*15137 + 8192) >> 14;
> +    t5 += (tb*10947 + 8192) >> 14;
> +    tb -= (t5*15137 + 8192) >> 14;
> +    t5 += (tb*10947 + 8192) >> 14;
> +    td += t5 - ((t3*21895 + 16384) >> 15);

These nine lines can overflow.

> +    tf = t9 - tf;
> +    tb += t3;
> +    tfh = DAALA_DCT_RSHIFT(tf, 1);
> +    t9 -= tfh;
> +    tbh = DAALA_DCT_RSHIFT(tb, 1);
> +    t3 += tfh - tbh;
> +    t1h = DAALA_DCT_RSHIFT(t1, 1);
> +    t7 = t1h - t7 + tbh;
> +    tdh = DAALA_DCT_RSHIFT(td, 1);
> +    t5 += t1h - tdh;
> +    t9 = tdh - t9;
> +    td -= t9;
> +    tf = t3 - tf;
> +    t1 -= t5 + ((tf*20055 + 16384) >> 15);
> +    tf += (t1*23059 + 8192) >> 14;
> +    t1 -= (tf*5417 + 4096) >> 13;

These three lines, too.

> +    tb = t7 - tb;
> +    t9 += (t7*14101 + 8192) >> 14;

This line, too.

> +    t7 += (t9*3363 + 4096) >> 13;
> +    t9 -= (t7*12905 + 8192) >> 14;
> +    tb -= (td*4379 + 8192) >> 14;
> +    td += (tb*20435 + 8192) >> 14;
> +    tb -= (td*17515 + 16384) >> 15;
> +    t3 += (t5*851 + 4096) >> 13;
> +    t5 += (t3*14699 + 8192) >> 14;
> +    t3 -= (t5*1035 + 1024) >> 11;
> +    t6 -= (ta*7335 + 16384) >> 15;
> +    ta -= (t6*12873 + 8192) >> 14;
> +    te += (t2*2873 + 1024) >> 11;
> +    t2 += (te*9041 + 16384) >> 15;
> +    t6 = DAALA_DCT_RSHIFT(t2, 1) - t6 - ((ta*8593 + 8192) >> 14);
> +    te = DAALA_DCT_RSHIFT(ta, 1) - te + ((t2*2275 + 1024) >> 11);

These thirteen lines, too.

> +    t2 -= t6;
> +    ta -= te;
> +    t6 -= (ta*13573 + 16384) >> 15;
> +    ta += (t6*11585 + 8192) >> 14;
> +    t6 -= (ta*13573 + 16384) >> 15;
> +    tc += (t4*9147 + 4096) >> 13;
> +    t4 -= (tc*10703 + 8192) >> 14;
> +    tc += (t4*23013 + 16384) >> 15;

These six lines, too.

[...]
> +static av_always_inline void idct_1D_32(pixel *x, int xstride, const pixel y[32])
> +{
> +    int t0 = y[0],  tg = y[1],  t8 = y[2],  to = y[3],  t4 = y[4],  tk = y[5];
> +    int tc = y[6],  ts = y[7],  t2 = y[8],  ti = y[9],  ta = y[10], tq = y[11];
> +    int t6 = y[12], tm = y[13], te = y[14], tu = y[15], t1 = y[16], th = y[17];
> +    int t9 = y[18], tp = y[19], t5 = y[20], tl = y[21], td = y[22], tt = y[23];
> +    int t3 = y[24], tj = y[25], tb = y[26], tr = y[27], t7 = y[28], tn = y[29];
> +    int tf = y[30], tv = y[31];
> +    OD_IDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
> +               t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);

This can overflow.

[...]
> +static av_always_inline void idct_1D_64(pixel *x, int xstride, const pixel y[64])
> +{
> +    int t0 = y[0],  tw = y[1],  tg = y[2],  tM = y[3],  t8 = y[4],  tE = y[5];
> +    int to = y[6],  tU = y[7],  t4 = y[8],  tA = y[9],  tk = y[10], tQ = y[11];
> +    int tc = y[12], tI = y[13], ts = y[14], tY = y[15], t2 = y[16], ty = y[17];
> +    int ti = y[18], tO = y[19], ta = y[20], tG = y[21], tq = y[22], tW = y[23];
> +    int t6 = y[24], tC = y[25], tm = y[26], tS = y[27], te = y[28], tK = y[29];
> +    int tu = y[30], t_ = y[31], t1 = y[32], tx = y[33], th = y[34], tN = y[35];
> +    int t9 = y[36], tF = y[37], tp = y[38], tV = y[39], t5 = y[40], tB = y[41];
> +    int tl = y[42], tR = y[43], td = y[44], tJ = y[45], tt = y[46], tZ = y[47];
> +    int t3 = y[48], tz = y[49], tj = y[50], tP = y[51], tb = y[52], tH = y[53];
> +    int tr = y[54], tX = y[55], t7 = y[56], tD = y[57], tn = y[58], tT = y[59];
> +    int tf = y[60], tL = y[61], tv = y[62],  t = y[63];
> +    OD_IDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
> +               t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx,
> +               th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP,
> +               tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t);

This can overflow.
> +static void postfilter_4x4(pixel *dst, const pixel *src)
> +{
> +    int t[4];
> +    t[3] = src[0]-src[3];
> +    t[2] = src[1]-src[2];
> +    t[1] = src[1]-(t[2]>>1);

These two lines can overflow.

> +    t[0] = src[0]-(t[3]>>1);
> +    t[2] -= (t[3]*FILTER_PARAM_4_3+32)>>6;
> +    t[3] -= (t[2]*FILTER_PARAM_4_2+32)>>6;
> +    #if FILTER_PARAM_4_1 != 64
> +    t[3] = t[3]*(1 << 6)/FILTER_PARAM_4_1;
> +    #endif
> +    #if FILTER_PARAM_4_0 != 64
> +    t[2] = t[2]*(1 << 6)/FILTER_PARAM_4_0;
> +    #endif

These four multiplications can overflow.
[...]
> +/* Chroma from luma */
> +static void daala_cfl_resample(uint8_t *_dst, int dstride, const uint8_t *_src,
> +                               int istride, int xdec, int ydec, int bs, int chroma_bs)
> +{
> +    int i, j;
> +    const int n = 4 << bs;
> +    pixel *dst = (pixel *)_dst;
> +    pixel *src = (pixel *)_src;
> +    if (!chroma_bs && (xdec || ydec)) {
> +        if (xdec) {
> +            if (ydec) {
> +                daala_sf_ful_up(dst, dstride, src, istride, n, n, n);
> +                for (i = 0; i < 4; i++) {
> +                    for (j = 0; j < 4; j++) {
> +                        dst[i*dstride + j] = (daaladsp_cfl_scale[j][i]*dst[i*dstride + j] + 64) >> 7;

The daaladsp_cfl_scale * dst multiplication can overflow, as well.

Best regards,
Andreas