[FFmpeg-devel] [PATCH]levc/hevc_cabac Optimise ff_hevc_hls_residual_coding (v2)

Fri Jan 22 19:33:32 CET 2016

On Fri, 22 Jan 2016 18:52:23 +0100, you wrote:

>Hi,
>
>2016-01-21 11:45 GMT+01:00 John Cox <jc at kynesim.co.uk>:
>> Hi
>>
>> v2 of my hevc residual patch
>
>I'll review the bit not related to significant coeffs first, because I
>think it is the most performance-sensitive. Also there are bits that
>could be moved to other patches, at least some are related to the
>later bypass patch. Here's a list you'll see detailed below:
>- coefficient saturation, which I think is OK to commit
>- bypass-related stuff
>- boolean stuff (!!stuff), which I think is OK to commit
>- cosmetics (like renaming a variable or introducing a shorthand)
>- sig(nificant coefficients )map
>
>The fact is I've benchmarked parts of the code and seeing slowdowns as
>well as speedups on x86_64, hence why it would be nice to be able to
>test and evaluate each of those parts separately.

Fair enough - though given that your slowdowns are almost certainly
cache-related the whole may be quite different from the sum of the
parts.

>> +// Helper fns
>> +#ifndef hevc_mem_bits32
>> +static av_always_inline uint32_t hevc_mem_bits32(const void * buf,
>> const unsigned int offset)
>> +{
>> +    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
>> +}
>> +#endif
>> +
>> +#if !defined(hevc_clz32)
>> +#define hevc_clz32 hevc_clz32_builtin
>> +static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
>> +{
>> +    // ff_clz says works on ints (probably) - so adjust if int is >32 bits long
>> +    // the fact that x is passed in as uint32_t will have cleared the top bits
>> +    return ff_clz(x) - (sizeof(int) * 8 - 32);
>> +}
>> +#endif
>> +
>> +#define bypass_start(s)
>> +#define bypass_finish(s)
>
>bypass-related?
>
>>  void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
>>  {
>>      if (s->ps.pps->entropy_coding_sync_enabled_flag &&
>> @@ -863,19 +928,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int
>> trafo_depth)
>>      return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
>>  }
>>
>> -static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
>> +static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
>>  {
>> -    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
>> +    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
>>  }
>>
>> -static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
>> +static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
>>  {
>> -    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
>> +    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
>>  }
>>
>> -static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
>> +static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
>>  {
>> -    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
>> +    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
>>  }
>
>Boolean stuff. Ideally, the whole boolean stuff topic would be better
>as a separate patch, with which I would be OK.
>
>>  int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
>> @@ -891,14 +956,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
>>      return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
>>  }
>>
>> -static av_always_inline void
>> last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
>> +static av_always_inline void
>> last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
>>                                                     int log2_size, int
>> *last_scx_prefix, int *last_scy_prefix)
>>  {
>>      int i = 0;
>>      int max = (log2_size << 1) - 1;
>>      int ctx_offset, ctx_shift;
>>
>> -    if (!c_idx) {
>> +    if (!c_idx_nz) {
>>          ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
>>          ctx_shift = (log2_size + 1) >> 2;
>>      } else {
>> @@ -929,22 +994,16 @@ static av_always_inline int
>> last_significant_coeff_suffix_decode(HEVCContext *s,
>>      return value;
>>  }
>>
>> -static av_always_inline int
>> significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int
>> ctx_cg)
>> +static av_always_inline int
>> significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int
>> ctx_cg)
>
>cosmetics?

I renamed the variable, because c_idx can have values 0..2 and c_idx_nz
is a boolean with 0..1 and in the rewrite of the inc var it is important
that we are using the _nz variant so having the var named appropriately
seemed sensible.

>>  {
>>      int inc;
>>
>> -    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
>> +    inc = (ctx_cg != 0) + (c_idx_nz << 1);
>>
>>      return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
>>  }
>
>boolean stuff
>
>> -static av_always_inline int significant_coeff_flag_decode(HEVCContext
>> *s, int x_c, int y_c,
>> -                                           int offset, const uint8_t
>> *ctx_idx_map)
>> -{
>> -    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
>> -    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
>> -}
>
>sigmap
>
>> -static av_always_inline int
>> significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
>> +static av_always_inline int
>> significant_coeff_flag_decode_0(HEVCContext *s, int offset)
>>  {
>>      return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
>>  }
>> @@ -966,65 +1025,305 @@ static av_always_inline int
>> coeff_abs_level_greater2_flag_decode(HEVCContext *s,
>>      return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
>>  }
>>
>> -static av_always_inline int
>> coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
>> +
>> +#if !USE_BY22
>> +#define coeff_abs_level_remaining_decode_bypass(s,r)
>> coeff_abs_level_remaining_decode(s, r)
>> +#endif
>> +
>> +
>> +#ifndef coeff_abs_level_remaining_decode_bypass
>> +static int coeff_abs_level_remaining_decode_bypass(HEVCContext *
>> const s, const unsigned int rice_param)
>>  {
>> +    CABACContext * const c = &s->HEVClc->cc;
>> +    uint32_t y;
>> +    unsigned int prefix;
>> +    unsigned int last_coeff_abs_level_remaining;
>> +    unsigned int n;
>> +
>> +    y = get_cabac_by22_peek(c);
>> +    prefix = hevc_clz32(~y);
>> +    // y << prefix will always have top bit 0
>> +
>> +    if (prefix < 3) {
>> +        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
>> +        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
>> +        n = prefix + 1 + rice_param;
>> +    }
>> +    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
>> +    {
>> +        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 -
>> (prefix + rice_param));
>> +
>> +        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
>> +        n = prefix * 2 + rice_param - 2;
>> +    }
>> +    else {
>> +        unsigned int suffix;
>> +
>> +        get_cabac_by22_flush(c, prefix, y);
>> +        y = get_cabac_by22_peek(c);
>> +
>> +        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
>> +        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
>> +        n = prefix + rice_param - 2;
>> +    }
>> +
>> +    get_cabac_by22_flush(c, n, y);
>> +
>> +    return last_coeff_abs_level_remaining;
>> +}
>> +#endif
>
>I think USE_BY22 is not yet defined?
>Related to bypass.

Ah - good point - I did mean to put those bits into the bypass patch but
I seem to have failed.

>> +
>> +static int coeff_abs_level_remaining_decode(HEVCContext * const s,
>> int rc_rice_param)
>> +{
>> +    CABACContext * const c = &s->HEVClc->cc;
>>      int prefix = 0;
>>      int suffix = 0;
>>      int last_coeff_abs_level_remaining;
>>      int i;
>>
>> -    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
>> +    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
>>          prefix++;
>>      if (prefix == CABAC_MAX_BIN) {
>>          av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
>>          return 0;
>>      }
>> +
>>      if (prefix < 3) {
>>          for (i = 0; i < rc_rice_param; i++)
>> -            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
>> +            suffix = (suffix << 1) | get_cabac_bypass(c);
>>          last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
>>      } else {
>>          int prefix_minus3 = prefix - 3;
>>          for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
>> -            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
>> +            suffix = (suffix << 1) | get_cabac_bypass(c);
>>          last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
>>                                                << rc_rice_param) + suffix;
>>      }
>> +
>>      return last_coeff_abs_level_remaining;
>>  }
>
>Cosmetics.
True

>> -static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
>> +#if !USE_BY22
>> +#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
>> +static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s,
>> const unsigned int nb)
>>  {
>> -    int i;
>> -    int ret = 0;
>> +    CABACContext * const c = &s->HEVClc->cc;
>> +    unsigned int i;
>> +    uint32_t ret = 0;
>>
>>      for (i = 0; i < nb; i++)
>> -        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
>> -    return ret;
>> +        ret = (ret << 1) | get_cabac_bypass(c);
>> +
>> +    return ret << (32 - nb);
>> +}
>> +#endif
>> +
>> +#ifndef coeff_sign_flag_decode_bypass
>> +static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext *
>> const s, const unsigned int nb)
>> +{
>> +    CABACContext * const c = &s->HEVClc->cc;
>> +    uint32_t y;
>> +    y = get_cabac_by22_peek(c);
>> +    get_cabac_by22_flush(c, nb, y);
>> +    return y & ~(0xffffffffU >> nb);
>> +}
>> +#endif
>
>bypass
Indeed - and was meant to be in a different patch

>> +#ifndef get_cabac_greater1_bits
>> +static inline unsigned int get_cabac_greater1_bits(CABACContext *
>> const c, const unsigned int n,
>> +    uint8_t * const state0)
>> +{
>> +    unsigned int i;
>> +    unsigned int rv = 0;
>> +    for (i = 0; i != n; ++i) {
>> +        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
>> +        const unsigned int b = get_cabac(c, state0 + idx);
>> +        rv = (rv << 1) | b;
>> +    }
>> +    return rv;
>> +}
>> +#endif
>
>I suppose branch prediction could help here, but less likely than for
>get_cabac_sig_coeff_flag_idxs.
>
>Also for this and some others: why inline over av_always_inline?
No particularly good reason for this one - though for any fn that might
be called from multiple places there is a strong argument for just
"inline" as it allows the compiler to make a judgment call based on how
big L1 cache is and how bad the call penalty.

>> +// N.B. levels returned are the values assuming coeff_abs_level_remaining
>> +// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
>> +// this version of events.
>> +static inline uint32_t get_greaterx_bits(HEVCContext * const s, const
>> unsigned int n_end, int * const levels,
>> +    int * const pprev_subset_coded, int * const psum,
>> +    const unsigned int idx0_gt1, const unsigned int idx_gt2)
>> +{
>> +    CABACContext * const c = &s->HEVClc->cc;
>> +    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
>> +    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
>> +    unsigned int rv;
>> +    unsigned int i;
>> +    const unsigned int n = FFMIN(n_end, 8);
>> +
>> +    // Really this is i != n but the simple unconditional loop is cheaper
>> +    // and faster
>> +    for (i = 0; i != 8; ++i)
>> +        levels[i] = 1;
>
>AV_WN64 of 0x0101010101010101ULL, or even a memset, as it would be
>inlined by the compiler, given its size, and done the best possible
>way.

levels is int *, not char *

>Also the function is related to but not strictly sigmap. Should be ok
>to be in the same patch, though.
>
>> +#ifndef trans_scale_sat
>> +static inline int trans_scale_sat(const int level, const unsigned int
>> scale, const unsigned int scale_m, const unsigned int shift)
>> +{
>> +    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift)
>> + 1) >> 1);
>> +}
>> +#endif
>
>Saturation, could be a separate patch, with which I would be ok.
>
>> +#ifndef update_rice
>> +static inline void update_rice(uint8_t * const stat_coeff,
>> +                              const unsigned int
>> last_coeff_abs_level_remaining,
>> +                              const unsigned int c_rice_param)
>> +{
>> +    const unsigned int x = (last_coeff_abs_level_remaining << 1) >>
>> c_rice_param;
>> +    if (x >= 6)
>> +        (*stat_coeff)++;
>> +    else if (x == 0 && *stat_coeff > 0)
>> +        (*stat_coeff)--;
>> +}
>> +#endif
>
>Related to but not strictly bypass ?

Not bypass per se, more the general optimisation of abs_level_remaining
- it is pulled out because I had a slightly better arm asm version of
the fn.  So it could go in that patch, but this allows other asm to
override it if they so desire.

>> +// n must be > 0 on entry
>> +#ifndef get_cabac_sig_coeff_flag_idxs
>> +static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext *
>> const c, uint8_t * const state0,
>> +                                                     unsigned int n,
>> +                                                     const uint8_t
>> const * ctx_map,
>> +                                                     uint8_t * p)
>> +{
>> +    do {
>> +        if (get_cabac(c, state0 + ctx_map[n]))
>> +            *p++ = n;
>> +    } while (--n != 0);
>> +    return p;
>> +}
>> +#endif
>
>Doing:
>    if (get_cabac(c, state0 + ctx_map[n]))
>        *p++ = n;
>    while (--n != 0) {
>        if (get_cabac(c, state0 + ctx_map[n]))
>            *p++ = n;
>    }
>is most likely faster, probably also on arm, if the branch prediction is good.

Not convinced.  That will increase code size (as get_cabac will inline)
which can be pure poison as you have found out with USE_N_END_1.

>For this one as well as the 3 following I left, I think
>av_always_inline is more important than some other functions that
>instead should be inline (eg left to the compiler to decide).

Fine

>> +
>> +
>> +static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t *
>> const state0,
>> +    unsigned int n,
>> +    const uint8_t const * ctx_map,
>> +    uint8_t * const flag_idx)
>> +{
>> +    int rv;
>> +
>> +    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map,
>> flag_idx) - flag_idx;
>> +
>> +    return rv;
>>  }
>
>The x86 h264 implementation does it slightly differently but it's hard
>to say which would be the best.
>Let's say arm is more in need of the best performance here.
>
>> +#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12,
>> x13, x14, x15) {\
>> +     x0,  x1,  x2,  x3,\
>> +     x4,  x5,  x6,  x7,\
>> +     x8,  x9, x10, x11,\
>> +    x12, x13, x14, x15}
>> +
>> +#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12,
>> x13, x14, x15) {\
>> +     x0,  x4,  x8, x12,\
>> +     x1,  x5,  x9, x13,\
>> +     x2,  x6, x10, x14,\
>> +     x3,  x7, x11, x15}
>> +
>> +#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12,
>> x13, x14, x15) {\
>> +     x0,  x4,  x1,  x8,\
>> +     x5,  x2, x12,  x9,\
>> +     x6,  x3, x13, x10,\
>> +     x7, x14, x11, x15}
>
>sigmap
>
>So, this allows reducing the indirection (lut1[lut2[]]), which is useful.
>
>I would have done it the same way, but it's sufficiently important
>that it'd be nice to have someone else's opinion on how to do it.
>
>> +static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
>> +                             uint8_t * const significant_coeff_group_flag,
>> +                             const uint8_t * const scan_x_cg, const
>> uint8_t * const scan_y_cg,
>> +                             int * const pPrev_sig)
>> +{
>> +    while (--i >= 0) {
>> +        unsigned int x_cg = scan_x_cg[i];
>> +        unsigned int y_cg = scan_y_cg[i];
>> +
>> +        // For the flag decode we only care about Z/NZ but
>> +        // we use the full Right + Down * 2 when calculating
>> +        // significant coeff flags so we obtain it here.
>> +        //
>> +        // The group flag array is one longer than it needs to
>> +        // be so we don't need to check for y_cg limits
>> +        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg]
>>>> (x_cg + 1)) & 1) |
>> +            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
>> +
>> +        if (i == 0 ||
>> +            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
>> +        {
>> +            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
>> +            *pPrev_sig = prev_sig;
>> +            break;
>> +        }
>> +    }
>> +
>> +    return i;
>> +}
>
>Not strictly sigmap, but related to. Ok to be in the same patch or another.
>
>> +    const int c_idx_nz = (c_idx != 0);
>
>boolean? (not sure)
I saw no booleans anywhere in the rest of this code so assumed they were
(at best) deprecated.  But if there is an official ffmpeg boolean then
that is what it should be.

>> +
>> +    int may_hide_sign;
>> +
>
>This and related are a slightly separate topic from the other stuff,
>but ok within any of the patches.
>
>> -        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
>> -        add      = 1 << (shift-1);
>> -        scale    = level_scale[rem6[qp]] << (div6[qp]);
>> -        scale_m  = 16; // default when no custom scaling lists.
>> -        dc_scale = 16;
>> +        // Shift is set to one less than will actually occur as the scale
>> +        // and saturate step adds 1 and then shifts right again
>> +        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
>> +        scale = level_scale[rem6[qp]];
>> +        if (div6[qp] >= shift) {
>> +            scale <<= (div6[qp] - shift);
>> +            shift = 0;
>> +        } else {
>> +            shift -= div6[qp];
>> +        }
>
>Saturation?
>
>> +            dc_scale = scale_matrix[0];
>>              if (log2_trafo_size >= 4)
>>                  dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
>>          }
>> +        else
>> +        {
>> +            static const uint8_t sixteen_scale[64] = {
>> +                16, 16, 16, 16, 16, 16, 16, 16,
>> +                16, 16, 16, 16, 16, 16, 16, 16,
>[...]
>> +            1, 1, 1, 1, 1, 1, 1, 1,
>> +            1, 1, 1, 1, 1, 1, 1, 1,
>> +            1, 1, 1, 1, 1, 1, 1, 1,
>> +        };
>> +        scale_matrix = unit_scale;
>>          shift        = 0;
>> -        add          = 0;
>> -        scale        = 0;
>> -        dc_scale     = 0;
>> +        scale        = 2;  // We will shift right to kill this
>> +        dc_scale     = 1;
>
>Saturation?
>
>> +#if USE_N_END_1
>> +            if (nb_significant_coeff_flag == 1) {
>> +                // There is a small gain to be had from special
>
>So we decided to have #define USE_N_END_1 ARCH_ARM. As said in another
>mail, there's a 10-20% loss for the codeblock.
>
>USE_BY22_DIV also strangely provides no benefit.
Slightly surprised by that, but DIV is going to produce smaller code so
should be preferred on general grounds as it puts off cache disasters.

>Best regards,
Off on holiday now, so this is going to be my last post till Monday 1st.

Regards & thanks

JC