[FFmpeg-devel] [PATCH] avcodec/vp9: Add tile threading support
Ronald S. Bultje
rsbultje at gmail.com
Wed Aug 23 19:11:17 EEST 2017
Hi,
Now for the review of the vp9-specific bits.
On Tue, Aug 22, 2017 at 7:59 PM, Ilia Valiakhmetov <zakne0ne at gmail.com>
wrote:
> + lflvl_len*sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
>
lflvl_len * sizeof: please add spaces around the '*'.
> - // these will be re-allocated a little later
> - av_freep(&s->b_base);
> - av_freep(&s->block_base);
>
I don't think you can remove that. I understand that td may or may not be
allocated here, so you probably need an if or so, but it's critically
important that we re-allocate these upon a size change, so this behaviour
probably needs to stay in.
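Something along these lines maybe (untested sketch; 'i' is a local, and the
count you loop over should probably be whatever number of td entries was
actually allocated for the previous size, not necessarily the new tile_cols):

    // these will be re-allocated a little later, once the new size is known
    if (s->td) {
        for (i = 0; i < s->s.h.tiling.tile_cols; i++) {
            av_freep(&s->td[i].b_base);
            av_freep(&s->td[i].block_base);
        }
    }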
> } else {
> - s->b_base = av_malloc(sizeof(VP9Block));
> - s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
> - 16 * 16 + 2 * chroma_eobs);
> - if (!s->b_base || !s->block_base)
> - return AVERROR(ENOMEM);
> - s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
> - s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
> - s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
> - s->uveob_base[0] = s->eob_base + 16 * 16;
> - s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
> + for (i = 1; i < s->s.h.tiling.tile_cols; i++) {
> + if (s->td[i].b_base && s->td[i].block_base) {
> + av_free(s->td[i].b_base);
> + av_free(s->td[i].block_base);
> + }
> + }
> + for (i = 0; i < s->s.h.tiling.tile_cols; i++) {
> + s->td[i].b_base = av_malloc(sizeof(VP9Block));
> + s->td[i].block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
> + 16 * 16 + 2 * chroma_eobs);
> + if (!s->td[i].b_base || !s->td[i].block_base)
> + return AVERROR(ENOMEM);
> + s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 * bytesperpixel;
> + s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] + chroma_blocks * bytesperpixel;
> + s->td[i].eob_base = (uint8_t *) (s->td[i].uvblock_base[1] + chroma_blocks * bytesperpixel);
> + s->td[i].uveob_base[0] = s->td[i].eob_base + 16 * 16;
> + s->td[i].uveob_base[1] = s->td[i].uveob_base[0] + chroma_eobs;
> + }
>
This makes the non-threaded decoder use a lot more memory, and I'm not sure
that's a good idea; it's certainly not required, since I believe the
non-threaded codepath is shared with frame-mt, so it only uses s->td[0].
Can we allocate less memory here if there's no frame-threading at all?
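Rough idea of what I mean (untested; n_td is just a name I'm making up here):

    // only allocate one set of per-tile buffers unless we actually do tile threading
    int n_td = (avctx->active_thread_type & FF_THREAD_SLICE) ? s->s.h.tiling.tile_cols : 1;

    for (i = 0; i < n_td; i++) {
        // ... same per-td allocations as in the hunk above ...
    }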
> +static int decode_tiles(AVCodecContext *avctx)
> +{
> + VP9Context *s = avctx->priv_data;
> + VP9TileData *td = &s->td[0];
> + int row, col, tile_row, tile_col;
> + int bytesperpixel;
> + int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
> + AVFrame *f;
> + ptrdiff_t yoff, uvoff, ls_y, ls_uv;
> +
> + f = s->s.frames[CUR_FRAME].tf.f;
> + ls_y = f->linesize[0];
> + ls_uv =f->linesize[1];
> + bytesperpixel = s->bytesperpixel;
> +
> + yoff = uvoff = 0;
> + for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
> + set_tile_offset(&tile_row_start, &tile_row_end,
> + tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
> +
> + for (row = tile_row_start; row < tile_row_end;
> + row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
> + VP9Filter *lflvl_ptr = s->lflvl;
> + ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
> +
> + for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
> + set_tile_offset(&tile_col_start, &tile_col_end,
> + tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
> + td->tile_col_start = tile_col_start;
> + if (s->pass != 2) {
> + memset(td->left_partition_ctx, 0, 8);
> + memset(td->left_skip_ctx, 0, 8);
> + if (s->s.h.keyframe || s->s.h.intraonly) {
> + memset(td->left_mode_ctx, DC_PRED, 16);
> + } else {
> + memset(td->left_mode_ctx, NEARESTMV, 8);
> + }
> + memset(td->left_y_nnz_ctx, 0, 16);
> + memset(td->left_uv_nnz_ctx, 0, 32);
> + memset(td->left_segpred_ctx, 0, 8);
> +
> + memcpy(&td->c, &s->td[tile_col].c_b[tile_row], sizeof(td->c));
> + }
> +
> + for (col = tile_col_start;
> + col < tile_col_end;
> + col += 8, yoff2 += 64 * bytesperpixel,
> + uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
> + // FIXME integrate with lf code (i.e. zero after each
> + // use, similar to invtxfm coefficients, or similar)
> + if (s->pass != 1) {
> + memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
> + }
> +
> + if (s->pass == 2) {
> + decode_sb_mem(td, row, col, lflvl_ptr,
> + yoff2, uvoff2, BL_64X64);
> + } else {
> + decode_sb(td, row, col, lflvl_ptr,
> + yoff2, uvoff2, BL_64X64);
> + }
> + }
> + if (s->pass != 2)
> + memcpy(&s->td[tile_col].c_b[tile_row], &td->c, sizeof(td->c));
> + }
> +
> + if (s->pass == 1)
> + continue;
> +
> + // backup pre-loopfilter reconstruction data for intra
> + // prediction of next row of sb64s
> + if (row + 8 < s->rows) {
> + memcpy(s->intra_pred_data[0],
> + f->data[0] + yoff + 63 * ls_y,
> + 8 * s->cols * bytesperpixel);
> + memcpy(s->intra_pred_data[1],
> + f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
> + 8 * s->cols * bytesperpixel >> s->ss_h);
> + memcpy(s->intra_pred_data[2],
> + f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
> + 8 * s->cols * bytesperpixel >> s->ss_h);
> + }
> +
> + // loopfilter one row
> + if (s->s.h.filter.level) {
> + yoff2 = yoff;
> + uvoff2 = uvoff;
> + lflvl_ptr = s->lflvl;
> + for (col = 0; col < s->cols;
> + col += 8, yoff2 += 64 * bytesperpixel,
> + uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
> + ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
> + yoff2, uvoff2);
> + }
> + }
> +
> + // FIXME maybe we can make this more finegrained by running the
> + // loopfilter per-block instead of after each sbrow
> + // In fact that would also make intra pred left preparation easier?
> + ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
> + }
> + }
> + return 0;
> +}
> +
> +
> +static av_always_inline
> +int decode_tiles_mt(AVCodecContext *avctx, void *tdata, int jobnr,
> + int threadnr)
> +{
> + VP9Context *s = avctx->priv_data;
> + VP9TileData *td = &s->td[jobnr];
> + ptrdiff_t uvoff, yoff, ls_y, ls_uv;
> + int bytesperpixel = s->bytesperpixel, row, col, tile_row;
> + unsigned tile_cols_len;
> + int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
> + VP9Filter *lflvl_ptr_base;
> + AVFrame *f;
> +
> + f = s->s.frames[CUR_FRAME].tf.f;
> + ls_y = f->linesize[0];
> + ls_uv =f->linesize[1];
> +
> + set_tile_offset(&tile_col_start, &tile_col_end,
> + jobnr, s->s.h.tiling.log2_tile_cols, s->sb_cols);
> + td->tile_col_start = tile_col_start;
> + uvoff = (64 * bytesperpixel >> s->ss_h)*(tile_col_start >> 3);
> + yoff = (64 * bytesperpixel)*(tile_col_start >> 3);
> + lflvl_ptr_base = s->lflvl+(tile_col_start >> 3);
> +
> + for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
> + set_tile_offset(&tile_row_start, &tile_row_end,
> + tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
> +
> + memcpy(&td->c, &td->c_b[tile_row], sizeof(td->c));
> + for (row = tile_row_start; row < tile_row_end;
> + row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
> + ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
> + VP9Filter *lflvl_ptr = lflvl_ptr_base+s->sb_cols*(row >> 3);
> +
> + memset(td->left_partition_ctx, 0, 8);
> + memset(td->left_skip_ctx, 0, 8);
> + if (s->s.h.keyframe || s->s.h.intraonly) {
> + memset(td->left_mode_ctx, DC_PRED, 16);
> + } else {
> + memset(td->left_mode_ctx, NEARESTMV, 8);
> + }
> + memset(td->left_y_nnz_ctx, 0, 16);
> + memset(td->left_uv_nnz_ctx, 0, 32);
> + memset(td->left_segpred_ctx, 0, 8);
> +
> + for (col = tile_col_start;
> + col < tile_col_end;
> + col += 8, yoff2 += 64 * bytesperpixel,
> + uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
> + // FIXME integrate with lf code (i.e. zero after each
> + // use, similar to invtxfm coefficients, or similar)
> + memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
> + decode_sb(td, row, col, lflvl_ptr,
> + yoff2, uvoff2, BL_64X64);
> + }
> +
> + // backup pre-loopfilter reconstruction data for intra
> + // prediction of next row of sb64s
> + tile_cols_len = tile_col_end - tile_col_start;
> + if (row + 8 < s->rows) {
> + memcpy(s->intra_pred_data[0] + (tile_col_start * 8 * bytesperpixel),
> + f->data[0] + yoff + 63 * ls_y,
> + 8 * tile_cols_len * bytesperpixel);
> + memcpy(s->intra_pred_data[1] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
> + f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
> + 8 * tile_cols_len * bytesperpixel >> s->ss_h);
> + memcpy(s->intra_pred_data[2] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
> + f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
> + 8 * tile_cols_len * bytesperpixel >> s->ss_h);
> + }
> +
> + ff_thread_report_progress2(avctx, row >> 3, 0, 1);
> + }
> + }
> + return 0;
> +}
> +
> +static av_always_inline
> +int loopfilter_proc(AVCodecContext *avctx)
> +{
> + VP9Context *s = avctx->priv_data;
> + ptrdiff_t uvoff, yoff, ls_y, ls_uv;
> + VP9Filter *lflvl_ptr;
> + int bytesperpixel = s->bytesperpixel, col, i;
> + AVFrame *f;
> +
> + f = s->s.frames[CUR_FRAME].tf.f;
> + ls_y = f->linesize[0];
> + ls_uv =f->linesize[1];
> +
> + for (i = 0; i < s->sb_rows; i++) {
> + ff_thread_await_progress3(avctx, i, 0, s->s.h.tiling.tile_cols);
> +
> + if (s->s.h.filter.level) {
> + yoff = (ls_y * 64)*i;
> + uvoff = (ls_uv * 64 >> s->ss_v)*i;
> + lflvl_ptr = s->lflvl+s->sb_cols*i;
> + for (col = 0; col < s->cols;
> + col += 8, yoff += 64 * bytesperpixel,
> + uvoff += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
> + ff_vp9_loopfilter_sb(avctx, lflvl_ptr, i << 3, col,
> + yoff, uvoff);
> + }
> + }
> + }
> return 0;
> }
>
Hm... There is some duplicate code here... I guess it's OK.
> + if (avctx->active_thread_type == FF_THREAD_SLICE)
> + for (i = 1; i < s->s.h.tiling.tile_cols; i++)
> + for (j = 0; j < sizeof(s->td[i].counts) / sizeof(unsigned); j++)
> + ((unsigned *)&s->td[0].counts)[j] += ((unsigned *)&s->td[i].counts)[j];
Add a comment so people understand what this does; it's kind of cryptic as it stands.
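E.g. something like this (wording up to you, assuming my understanding of why
the merge is here is right):

    /* Sum the symbol counts gathered by each tile thread into td[0], so
     * that the probability adaptation only has to look at one set of
     * counters. */
    if (avctx->active_thread_type == FF_THREAD_SLICE)
        for (i = 1; i < s->s.h.tiling.tile_cols; i++)
            for (j = 0; j < sizeof(s->td[i].counts) / sizeof(unsigned); j++)
                ((unsigned *)&s->td[0].counts)[j] += ((unsigned *)&s->td[i].counts)[j];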
> +typedef struct VP9TileData {
> + VP9Context *s;
>
Can this be marked const?
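I.e. something like this, assuming the tile workers never need to write to
VP9Context members directly:

    typedef struct VP9TileData {
        const VP9Context *s;  // shared decoder context, read-only from the tile workers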
> + VP56RangeCoder c_b[4];
> + VP56RangeCoder c;
>
Why do you need 'c', and can that be a pointer into one of the c_b entries?
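What I mean is something like (untested sketch):

    VP56RangeCoder c_b[4];
    VP56RangeCoder *c;  // points at whichever range coder is currently being read

and then e.g. td->c = &td->c_b[tile_row]; in the tile-mt loop (and the
equivalent &s->td[tile_col].c_b[tile_row] in decode_tiles()), instead of
memcpy()ing the coder state back and forth. But maybe there's a reason that
doesn't work.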
Ronald