[FFmpeg-devel] [PATCH] VC1: merge idct8x8, coeff adjustments and put_pixels.

Mon Feb 21 14:23:09 CET 2011

On Mon, Feb 21, 2011 at 08:13:28AM -0500, Ronald S. Bultje wrote:
> Merging these functions allows merging some loops, which makes the
> results (particularly after SIMD optimizations) much faster.
> ---
>  libavcodec/ppc/vc1dsp_altivec.c |   66 +++++++++++++++++++++++++++++++++++++--
>  libavcodec/vc1.c                |   28 ++++++++++++++++-
>  libavcodec/vc1dec.c             |   58 +++++++++++++++++++---------------
>  libavcodec/vc1dsp.c             |   54 ++++++++++++++++++++++++++------
>  libavcodec/vc1dsp.h             |    5 ++-
>  5 files changed, 171 insertions(+), 40 deletions(-)
> 
> diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c
> index 5a0dddb..16238f6 100644
> --- a/libavcodec/ppc/vc1dsp_altivec.c
> +++ b/libavcodec/ppc/vc1dsp_altivec.c
> @@ -130,7 +130,8 @@ do { \
>  
>  /** Do inverse transform on 8x8 block
>  */
> -static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
> +static void vc1_inv_trans_8x8_altivec(DCTELEM block[64],
> +                                      int sign, int rangered)
>  {
>      vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
>      vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
> @@ -144,7 +145,9 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
>      const vector unsigned int vec_2 = vec_splat_u32(2);
>      const vector  signed int vec_1s = vec_splat_s32(1);
>      const vector unsigned int vec_1 = vec_splat_u32(1);
> -
> +    const vector unsigned short rangered_shift = vec_splat_u16(1);
> +    const vector unsigned short unsigned_bias = vec_sl(vec_splat_u16(4),
> +                                                       vec_splat_u16(4));
>  
>      src0 = vec_ld(  0, block);
>      src1 = vec_ld( 16, block);
> @@ -214,6 +217,27 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
>      src6 = vec_pack(sE, s6);
>      src7 = vec_pack(sF, s7);
>  
> +    if (rangered) {
> +        if (!sign) {
> +            vec_sub(src0, unsigned_bias);
> +            vec_sub(src1, unsigned_bias);
> +            vec_sub(src2, unsigned_bias);
> +            vec_sub(src3, unsigned_bias);
> +            vec_sub(src4, unsigned_bias);
> +            vec_sub(src5, unsigned_bias);
> +            vec_sub(src6, unsigned_bias);
> +            vec_sub(src7, unsigned_bias);
> +        }
> +        vec_sl(src0, rangered_shift);
> +        vec_sl(src1, rangered_shift);
> +        vec_sl(src2, rangered_shift);
> +        vec_sl(src3, rangered_shift);
> +        vec_sl(src4, rangered_shift);
> +        vec_sl(src5, rangered_shift);
> +        vec_sl(src6, rangered_shift);
> +        vec_sl(src7, rangered_shift);
> +    }
> +
>      vec_st(src0,  0, block);
>      vec_st(src1, 16, block);
>      vec_st(src2, 32, block);
> @@ -224,6 +248,38 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
>      vec_st(src7,112, block);
>  }
>  
> +static void vc1_inv_trans_8x8_add_altivec(uint8_t *dest, int stride, DCTELEM *b)
> +{
> +    vc1_inv_trans_8x8_altivec(b);
> +    ff_add_pixels_clamped_c(b, dst, stride);
> +}
> +
> +static void vc1_inv_trans_8x8_put_signed_altivec(uint8_t *dest, int stride, DCTELEM *b)
> +{
> +    vc1_inv_trans_8x8_altivec(b);
> +    ff_put_signed_pixels_clamped_c(b, dst, stride);
> +}
> +
> +static void vc1_inv_trans_8x8_put_signed_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
> +{
> +    vc1_inv_trans_8x8_altivec(b);
> +    <<= 1 for each coeff

ahem

> +    ff_put_signed_pixels_clamped_c(b, dst, stride);
> +}
> +
> +static void vc1_inv_trans_8x8_put_altivec(uint8_t *dest, int stride, DCTELEM *b)
> +{
> +    vc1_inv_trans_8x8_altivec(b);
> +    ff_put_pixels_clamped_c(b, dst, stride);
> +}
> +
> +static void vc1_inv_trans_8x8_put_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
> +{
> +    vc1_inv_trans_8x8_altivec(b);
> +    -64, then <<= 1 for each coeff

Looks like somebody decided that typing even two slashes is too hard.

> +    ff_put_pixels_clamped_c(b, dst, stride);
> +}
> +
>  /** Do inverse transform on 8x4 part of block
>  */
>  static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
> @@ -342,7 +398,11 @@ void ff_vc1dsp_init_altivec(VC1DSPContext* dsp)
>      if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
>          return;
>  
> -    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
> +    dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_altivec;
> +    dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_altivec;
> +    dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_altivec;
> +    dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_altivec;
> +    dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_altivec;
>      dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
>      dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
>      dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
> diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
> index 8bd6647..27cd010 100644
> --- a/libavcodec/vc1.c
> +++ b/libavcodec/vc1.c
> @@ -280,6 +280,28 @@ static int vop_dquant_decoding(VC1Context *v)
>  
>  static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb);
>  
> +static void simple_idct_put_rangered(uint8_t *dest, int line_size, DCTELEM *block)
> +{
> +    int i;
> +    ff_simple_idct(block);
> +    for (i = 0; i < 64; i++) block[i] = (block[i] - 64) << 1;
> +    ff_put_pixels_clamped_c(block, dest, line_size);
> +}
> +
> +static void simple_idct_put_signed(uint8_t *dest, int line_size, DCTELEM *block)
> +{
> +    ff_simple_idct(block);
> +    ff_put_signed_pixels_clamped_c(block, dest, line_size);
> +}
> +
> +static void simple_idct_put_signed_rangered(uint8_t *dest, int line_size, DCTELEM *block)
> +{
> +    int i;
> +    ff_simple_idct(block);
> +    for (i = 0; i < 64; i++) block[i] <<= 1;
> +    ff_put_signed_pixels_clamped_c(block, dest, line_size);
> +}
> +
>  /**
>   * Decode Simple/Main Profiles sequence header
>   * @see Figure 7-8, p16-17
> @@ -337,7 +359,11 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte
>      v->res_fasttx = get_bits1(gb);
>      if (!v->res_fasttx)
>      {
> -        v->vc1dsp.vc1_inv_trans_8x8 = ff_simple_idct;
> +        v->vc1dsp.vc1_inv_trans_8x8_add = ff_simple_idct_add;
> +        v->vc1dsp.vc1_inv_trans_8x8_put[0] = ff_simple_idct_put;
> +        v->vc1dsp.vc1_inv_trans_8x8_put[1] = simple_idct_put_rangered;
> +        v->vc1dsp.vc1_inv_trans_8x8_put_signed[0] = simple_idct_put_signed;
> +        v->vc1dsp.vc1_inv_trans_8x8_put_signed[1] = simple_idct_put_signed_rangered;
>          v->vc1dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
>          v->vc1dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
>          v->vc1dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
> diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
> index a3db6fe..ed92d8c 100644
> --- a/libavcodec/vc1dec.c
> +++ b/libavcodec/vc1dec.c
> @@ -2009,8 +2009,7 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
>              if(i==1)
>                  v->vc1dsp.vc1_inv_trans_8x8_dc(dst, linesize, block);
>              else{
> -                v->vc1dsp.vc1_inv_trans_8x8(block);
> -                s->dsp.add_pixels_clamped(block, dst, linesize);
> +                v->vc1dsp.vc1_inv_trans_8x8_add(dst, linesize, block);
>              }
>              if(apply_filter && cbp_top  & 0xC)
>                  v->vc1dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
> @@ -2117,7 +2116,7 @@ static int vc1_decode_p_mb(VC1Context *v)
>  {
>      MpegEncContext *s = &v->s;
>      GetBitContext *gb = &s->gb;
> -    int i, j;
> +    int i;
>      int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
>      int cbp; /* cbp decoding stuff */
>      int mqdiff, mquant; /* MB quantization */
> @@ -2149,6 +2148,8 @@ static int vc1_decode_p_mb(VC1Context *v)
>      {
>          if (!skipped)
>          {
> +            vc1_idct_func idct8x8_fn;
> +
>              GET_MVDATA(dmv_x, dmv_y);
>  
>              if (s->mb_intra) {
> @@ -2183,6 +2184,7 @@ static int vc1_decode_p_mb(VC1Context *v)
>                                  VC1_TTMB_VLC_BITS, 2);
>              if(!s->mb_intra) vc1_mc_1mv(v, 0);
>              dst_idx = 0;
> +            idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
>              for (i=0; i<6; i++)
>              {
>                  s->dc_val[0][s->block_index[i]] = 0;
> @@ -2200,9 +2202,9 @@ static int vc1_decode_p_mb(VC1Context *v)
>  
>                      vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
>                      if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
> -                    v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
> -                    if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
> -                    s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
> +                    idct8x8_fn(s->dest[dst_idx] + off,
> +                               i & 4 ? s->uvlinesize : s->linesize,
> +                               s->block[i]);
>                      if(v->pq >= 9 && v->overlap) {
>                          if(v->c_avail)
>                              v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
> @@ -2267,6 +2269,7 @@ static int vc1_decode_p_mb(VC1Context *v)
>          {
>              int intra_count = 0, coded_inter = 0;
>              int is_intra[6], is_coded[6];
> +            vc1_idct_func idct8x8_fn;
>              /* Get CBPCY */
>              cbp = get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
>              for (i=0; i<6; i++)
> @@ -2316,6 +2319,7 @@ static int vc1_decode_p_mb(VC1Context *v)
>              }
>              if (!v->ttmbf && coded_inter)
>                  ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
> +            idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
>              for (i=0; i<6; i++)
>              {
>                  dst_idx += i >> 2;
> @@ -2331,9 +2335,9 @@ static int vc1_decode_p_mb(VC1Context *v)
>  
>                      vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant, (i&4)?v->codingset2:v->codingset);
>                      if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
> -                    v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
> -                    if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
> -                    s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize);
> +                    idct8x8_fn(s->dest[dst_idx] + off,
> +                               (i&4)?s->uvlinesize:s->linesize,
> +                               s->block[i]);
>                      if(v->pq >= 9 && v->overlap) {
>                          if(v->c_avail)
>                              v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
> @@ -2409,7 +2413,7 @@ static void vc1_decode_b_mb(VC1Context *v)
>  {
>      MpegEncContext *s = &v->s;
>      GetBitContext *gb = &s->gb;
> -    int i, j;
> +    int i;
>      int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
>      int cbp = 0; /* cbp decoding stuff */
>      int mqdiff, mquant; /* MB quantization */
> @@ -2422,6 +2426,7 @@ static void vc1_decode_b_mb(VC1Context *v)
>      int skipped, direct;
>      int dmv_x[2], dmv_y[2];
>      int bmvtype = BMV_TYPE_BACKWARD;
> +    vc1_idct_func idct8x8_fn;
>  
>      mquant = v->pq; /* Loosy initialization */
>      s->mb_intra = 0;
> @@ -2519,6 +2524,7 @@ static void vc1_decode_b_mb(VC1Context *v)
>          }
>      }
>      dst_idx = 0;
> +    idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
>      for (i=0; i<6; i++)
>      {
>          s->dc_val[0][s->block_index[i]] = 0;
> @@ -2536,9 +2542,9 @@ static void vc1_decode_b_mb(VC1Context *v)
>  
>              vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
>              if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
> -            v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
> -            if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
> -            s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
> +            idct8x8_fn(s->dest[dst_idx] + off,
> +                       i & 4 ? s->uvlinesize : s->linesize,
> +                       s->block[i]);
>          } else if(val) {
>              vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block, s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize, (i&4) && (s->flags & CODEC_FLAG_GRAY), 0, 0, 0);
>              if(!v->ttmbf && ttmb < 8) ttmb = -1;
> @@ -2551,11 +2557,12 @@ static void vc1_decode_b_mb(VC1Context *v)
>   */
>  static void vc1_decode_i_blocks(VC1Context *v)
>  {
> -    int k, j;
> +    int k;
>      MpegEncContext *s = &v->s;
>      int cbp, val;
>      uint8_t *coded_val;
>      int mb_pos;
> +    vc1_idct_func idct8x8_fn;
>  
>      /* select codingmode used for VLC tables selection */
>      switch(v->y_ac_table_index){
> @@ -2590,6 +2597,10 @@ static void vc1_decode_i_blocks(VC1Context *v)
>      s->mb_x = s->mb_y = 0;
>      s->mb_intra = 1;
>      s->first_slice_line = 1;
> +    if(v->pq >= 9 && v->overlap) {
> +        idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
> +    } else
> +        idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put[!!v->rangeredfrm];
>      for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
>          s->mb_x = 0;
>          ff_init_block_index(s);
> @@ -2626,14 +2637,9 @@ static void vc1_decode_i_blocks(VC1Context *v)
>                  vc1_decode_i_block(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2);
>  
>                  if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
> -                v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
> -                if(v->pq >= 9 && v->overlap) {
> -                    if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] <<= 1;
> -                    s->dsp.put_signed_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
> -                } else {
> -                    if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] = (s->block[k][j] - 64) << 1;
> -                    s->dsp.put_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
> -                }
> +                idct8x8_fn(dst[k],
> +                           k & 4 ? s->uvlinesize : s->linesize,
> +                           s->block[k]);
>              }
>  
>              if(v->pq >= 9 && v->overlap) {
> @@ -2691,6 +2697,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
>      int mqdiff;
>      int overlap;
>      GetBitContext *gb = &s->gb;
> +    vc1_idct_func idct8x8_fn;
>  
>      /* select codingmode used for VLC tables selection */
>      switch(v->y_ac_table_index){
> @@ -2721,6 +2728,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
>      s->mb_x = s->mb_y = 0;
>      s->mb_intra = 1;
>      s->first_slice_line = 1;
> +    idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[0];
>      for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
>          s->mb_x = 0;
>          ff_init_block_index(s);
> @@ -2777,9 +2785,9 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
>                  vc1_decode_i_block_adv(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2, mquant);
>  
>                  if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
> -                v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
> -                s->dsp.put_signed_pixels_clamped(s->block[k], dst[k],
> -                                                 k & 4 ? s->uvlinesize : s->linesize);
> +                idct8x8_fn(dst[k],
> +                           k & 4 ? s->uvlinesize : s->linesize,
> +                           s->block[k]);
>              }
>  
>              if(overlap) {
> diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
> index 000dad7..dbe2120 100644
> --- a/libavcodec/vc1dsp.c
> +++ b/libavcodec/vc1dsp.c
> @@ -199,7 +199,7 @@ static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
>      }
>  }
>  
> -static void vc1_inv_trans_8x8_c(DCTELEM block[64])
> +static av_always_inline void vc1_inv_trans_8x8_c(DCTELEM block[64], int shl, int sub)
>  {
>      int i;
>      register int t1,t2,t3,t4,t5,t6,t7,t8;
> @@ -254,20 +254,50 @@ static void vc1_inv_trans_8x8_c(DCTELEM block[64])
>          t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
>          t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
>  
> -        dst[ 0] = (t5 + t1) >> 7;
> -        dst[ 8] = (t6 + t2) >> 7;
> -        dst[16] = (t7 + t3) >> 7;
> -        dst[24] = (t8 + t4) >> 7;
> -        dst[32] = (t8 - t4 + 1) >> 7;
> -        dst[40] = (t7 - t3 + 1) >> 7;
> -        dst[48] = (t6 - t2 + 1) >> 7;
> -        dst[56] = (t5 - t1 + 1) >> 7;
> +        dst[ 0] = (((t5 + t1    ) >> 7) - sub) << shl;
> +        dst[ 8] = (((t6 + t2    ) >> 7) - sub) << shl;
> +        dst[16] = (((t7 + t3    ) >> 7) - sub) << shl;
> +        dst[24] = (((t8 + t4    ) >> 7) - sub) << shl;
> +        dst[32] = (((t8 - t4 + 1) >> 7) - sub) << shl;
> +        dst[40] = (((t7 - t3 + 1) >> 7) - sub) << shl;
> +        dst[48] = (((t6 - t2 + 1) >> 7) - sub) << shl;
> +        dst[56] = (((t5 - t1 + 1) >> 7) - sub) << shl;
>  
>          src++;
>          dst++;
>      }
>  }
>  
> +static void vc1_inv_trans_8x8_add_c(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> +    vc1_inv_trans_8x8_c(block, 0, 0);
> +    ff_add_pixels_clamped_c(block, dest, linesize);
> +}
> +
> +static void vc1_inv_trans_8x8_put_signed_c(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> +    vc1_inv_trans_8x8_c(block, 0, 0);
> +    ff_put_signed_pixels_clamped_c(block, dest, linesize);
> +}
> +
> +static void vc1_inv_trans_8x8_put_signed_rangered_c(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> +    vc1_inv_trans_8x8_c(block, 1, 0);
> +    ff_put_signed_pixels_clamped_c(block, dest, linesize);
> +}
> +
> +static void vc1_inv_trans_8x8_put_c(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> +    vc1_inv_trans_8x8_c(block, 0, 0);
> +    ff_put_pixels_clamped_c(block, dest, linesize);
> +}
> +
> +static void vc1_inv_trans_8x8_put_rangered_c(uint8_t *dest, int linesize, DCTELEM *block)
> +{
> +    vc1_inv_trans_8x8_c(block, 1, 64);
> +    ff_put_pixels_clamped_c(block, dest, linesize);
> +}
> +
>  /** Do inverse transform on 8x4 part of block
>  */
>  static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
> @@ -662,7 +692,11 @@ static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
>  }
>  
>  av_cold void ff_vc1dsp_init(VC1DSPContext* dsp) {
> -    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_c;
> +    dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_c;
> +    dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_c;
> +    dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_c;
> +    dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_c;
> +    dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_c;
>      dsp->vc1_inv_trans_4x8 = vc1_inv_trans_4x8_c;
>      dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_c;
>      dsp->vc1_inv_trans_4x4 = vc1_inv_trans_4x4_c;
> diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
> index a1f3d90..b8da184 100644
> --- a/libavcodec/vc1dsp.h
> +++ b/libavcodec/vc1dsp.h
> @@ -30,9 +30,12 @@
>  
>  #include "dsputil.h"
>  
> +typedef void (*vc1_idct_func)(uint8_t *dest, int line_size, DCTELEM *block);
>  typedef struct VC1DSPContext {
>      /* vc1 functions */
> -    void (*vc1_inv_trans_8x8)(DCTELEM *b);
> +    vc1_idct_func vc1_inv_trans_8x8_add;
> +    vc1_idct_func vc1_inv_trans_8x8_put_signed[2];
> +    vc1_idct_func vc1_inv_trans_8x8_put[2];
>      void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
>      void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
>      void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);
> -- 
> 1.7.2.1

Besides those pseudo-code lines that are not marked as comments, the patch looks OK.