[FFmpeg-cvslog] avcodec/hevc: reduce memory for SAO

Wed Feb 4 02:06:59 CET 2015

ffmpeg | branch: master | Fabrice Bellard <fabrice at bellard.org> | Mon Jan 12 23:09:23 2015 +0100| [da81cc38e8a90071484508fbdd8fb888dc9a4b7b] | committer: Michael Niedermayer

avcodec/hevc: reduce memory for SAO

cherry picked from commit 5d9f79edef2c11b915bdac3a025b59a32082f409

SAO edge filter uses pre-SAO pixel data on the left and top of the ctb, so
this data must be kept available. This was done previously by having 2
copies of the frame, one before and one after SAO.

This commit reduces the storage to just that, instead of the previous whole
frame.

Commit message taken from patch by Christophe Gisquet <christophe.gisquet at gmail.com>

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=da81cc38e8a90071484508fbdd8fb888dc9a4b7b
---

 libavcodec/hevc.c        |   40 +++++++++-
 libavcodec/hevc.h        |   14 +++-
 libavcodec/hevc_filter.c |  184 +++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 224 insertions(+), 14 deletions(-)

diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index 9459026..a940f0b 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -104,7 +104,8 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
 
     s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height);
     s->tab_ipm  = av_mallocz(min_pu_size);
-    s->is_pcm   = av_malloc_array(sps->min_pu_width + 1, sps->min_pu_height + 1);
+    s->is_pcm   = av_mallocz_array(sps->min_pu_width + 1, sps->min_pu_height + 1);
+
     if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm)
         goto fail;
 
@@ -353,9 +354,34 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps)
     ff_videodsp_init (&s->vdsp,    sps->bit_depth);
 
     if (sps->sao_enabled && !s->avctx->hwaccel) {
+#ifdef USE_SAO_SMALL_BUFFER
+        {
+            int ctb_size = 1 << sps->log2_ctb_size;
+            int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
+            int c_idx, i;
+
+            for (i = 0; i < s->threads_number ; i++) {
+                HEVCLocalContext    *lc = s->HEVClcList[i];
+                lc->sao_pixel_buffer =
+                    av_malloc(((ctb_size + 2) * (ctb_size + 2)) <<
+                              sps->pixel_shift);
+            }
+            for(c_idx = 0; c_idx < c_count; c_idx++) {
+                int w = sps->width >> sps->hshift[c_idx];
+                int h = sps->height >> sps->vshift[c_idx];
+                s->sao_pixel_buffer_h[c_idx] =
+                av_malloc((w * 2 * sps->ctb_height) <<
+                          sps->pixel_shift);
+                s->sao_pixel_buffer_v[c_idx] =
+                av_malloc((h * 2 * sps->ctb_width) <<
+                          sps->pixel_shift);
+            }
+        }
+#else
         av_frame_unref(s->tmp_frame);
         ret = get_buffer_sao(s, s->tmp_frame, sps);
         s->sao_frame = s->tmp_frame;
+#endif
     }
 
     s->sps = sps;
@@ -3186,7 +3212,17 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
 
     av_freep(&s->cabac_state);
 
+#ifdef USE_SAO_SMALL_BUFFER
+    for (i = 0; i < s->threads_number; i++) {
+        av_freep(&s->HEVClcList[i]->sao_pixel_buffer);
+    }
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->sao_pixel_buffer_h[i]);
+        av_freep(&s->sao_pixel_buffer_v[i]);
+    }
+#else
     av_frame_free(&s->tmp_frame);
+#endif
     av_frame_free(&s->output_frame);
 
     for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@@ -3246,9 +3282,11 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
     if (!s->cabac_state)
         goto fail;
 
+#ifndef USE_SAO_SMALL_BUFFER
     s->tmp_frame = av_frame_alloc();
     if (!s->tmp_frame)
         goto fail;
+#endif
 
     s->output_frame = av_frame_alloc();
     if (!s->output_frame)
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index 1727b60..482341a 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -36,6 +36,8 @@
 #include "thread.h"
 #include "videodsp.h"
 
+//#define USE_SAO_SMALL_BUFFER /* reduce the memory used by SAO */
+
 #define MAX_DPB_SIZE 16 // A.4.1
 #define MAX_REFS 16
 
@@ -745,6 +747,9 @@ typedef struct HEVCNAL {
 } HEVCNAL;
 
 typedef struct HEVCLocalContext {
+#ifdef USE_SAO_SMALL_BUFFER
+    uint8_t *sao_pixel_buffer;
+#endif
     uint8_t cabac_state[HEVC_CONTEXTS];
 
     uint8_t stat_coeff[4];
@@ -807,9 +812,14 @@ typedef struct HEVCContext {
     uint8_t slice_initialized;
 
     AVFrame *frame;
-    AVFrame *sao_frame;
-    AVFrame *tmp_frame;
     AVFrame *output_frame;
+#ifdef USE_SAO_SMALL_BUFFER
+    uint8_t *sao_pixel_buffer_h[3];
+    uint8_t *sao_pixel_buffer_v[3];
+#else
+    AVFrame *tmp_frame;
+    AVFrame *sao_frame;
+#endif
 
     const HEVCVPS *vps;
     const HEVCSPS *sps;
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 90bf1fe..b8aa71e 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -139,7 +139,7 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
     return s->qp_y_tab[x + y * s->sps->min_cb_width];
 }
 
-static void copy_CTB(uint8_t *dst, uint8_t *src, int width, int height,
+static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
                      intptr_t stride_dst, intptr_t stride_src)
 {
 int i, j;
@@ -161,13 +161,65 @@ int i, j;
     }
 }
 
-static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int height, int c_idx)
+#if defined(USE_SAO_SMALL_BUFFER)
+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
+{
+    if (pixel_shift)
+        *(uint16_t *)dst = *(uint16_t *)src;
+    else
+        *dst = *src;
+
+}
+
+static void copy_vert(uint8_t *dst, const uint8_t *src,
+                      int pixel_shift, int height,
+                      int stride_dst, int stride_src)
+{
+    int i;
+    if (pixel_shift == 0) {
+        for (i = 0; i < height; i++) {
+            *dst = *src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    } else {
+        for (i = 0; i < height; i++) {
+            *(uint16_t *)dst = *(uint16_t *)src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
+                           int stride_src, int x, int y, int width, int height,
+                           int c_idx, int x_ctb, int y_ctb)
+{
+    int sh = s->sps->pixel_shift;
+    int w = s->sps->width >> s->sps->hshift[c_idx];
+    int h = s->sps->height >> s->sps->vshift[c_idx];
+
+    /* copy horizontal edges */
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
+        src, width << sh);
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
+        src + stride_src * (height - 1), width << sh);
+
+    /* copy vertical edges */
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
+
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
+}
+#endif
+
+static void restore_tqb_pixels(HEVCContext *s,
+                               uint8_t *src1, const uint8_t *dst1,
+                               ptrdiff_t stride_src, ptrdiff_t stride_dst,
+                               int x0, int y0, int width, int height, int c_idx)
 {
     if ( s->pps->transquant_bypass_enable_flag ||
             (s->sps->pcm.loop_filter_disable_flag && s->sps->pcm_enabled_flag)) {
         int x, y;
-        ptrdiff_t stride_dst = s->sao_frame->linesize[c_idx];
-        ptrdiff_t stride_src = s->frame->linesize[c_idx];
         int min_pu_size  = 1 << s->sps->log2_min_pu_size;
         int hshift       = s->sps->hshift[c_idx];
         int vshift       = s->sps->vshift[c_idx];
@@ -175,13 +227,13 @@ static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int he
         int y_min        = ((y0         ) >> s->sps->log2_min_pu_size);
         int x_max        = ((x0 + width ) >> s->sps->log2_min_pu_size);
         int y_max        = ((y0 + height) >> s->sps->log2_min_pu_size);
-        int len          = min_pu_size >> hshift;
+        int len          = (min_pu_size >> hshift) << s->sps->pixel_shift;
         for (y = y_min; y < y_max; y++) {
             for (x = x_min; x < x_max; x++) {
                 if (s->is_pcm[y * s->sps->min_pu_width + x]) {
                     int n;
-                    uint8_t *src = &s->frame->data[c_idx][    ((y << s->sps->log2_min_pu_size) >> vshift) * stride_src + (((x << s->sps->log2_min_pu_size) >> hshift) << s->sps->pixel_shift)];
-                    uint8_t *dst = &s->sao_frame->data[c_idx][((y << s->sps->log2_min_pu_size) >> vshift) * stride_dst + (((x << s->sps->log2_min_pu_size) >> hshift) << s->sps->pixel_shift)];
+                    uint8_t *src = src1 + (((y << s->sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->sps->log2_min_pu_size) - x0) >> hshift) << s->sps->pixel_shift);
+                    const uint8_t *dst = dst1 + (((y << s->sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->sps->log2_min_pu_size) - x0) >> hshift) << s->sps->pixel_shift);
                     for (n = 0; n < (min_pu_size >> vshift); n++) {
                         memcpy(src, dst, len);
                         src += stride_src;
@@ -198,6 +250,7 @@ static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int he
 static void sao_filter_CTB(HEVCContext *s, int x, int y)
 {
     static const uint8_t band_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
+    HEVCLocalContext *lc = s->HEVClc;
     int c_idx;
     int edges[4];  // 0 left 1 top 2 right 3 bottom
     int x_ctb                = x >> s->sps->log2_ctb_size;
@@ -258,27 +311,132 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
         int x0       = x >> s->sps->hshift[c_idx];
         int y0       = y >> s->sps->vshift[c_idx];
         int stride_src = s->frame->linesize[c_idx];
-        int stride_dst = s->sao_frame->linesize[c_idx];
         int ctb_size_h = (1 << (s->sps->log2_ctb_size)) >> s->sps->hshift[c_idx];
         int ctb_size_v = (1 << (s->sps->log2_ctb_size)) >> s->sps->vshift[c_idx];
         int width    = FFMIN(ctb_size_h, (s->sps->width  >> s->sps->hshift[c_idx]) - x0);
         int height   = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0);
         int tab      = band_tab[(FFALIGN(width, 8) >> 3) - 1];
         uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)];
+#if defined(USE_SAO_SMALL_BUFFER)
+        int stride_dst = ((1 << (s->sps->log2_ctb_size)) + 2) << s->sps->pixel_shift;
+        uint8_t *dst = lc->sao_pixel_buffer + (1 * stride_dst) + (1 << s->sps->pixel_shift);
+#else
+        int stride_dst = s->sao_frame->linesize[c_idx];
         uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride_dst + (x0 << s->sps->pixel_shift)];
+#endif
 
         switch (sao->type_idx[c_idx]) {
         case SAO_BAND:
             copy_CTB(dst, src, width << s->sps->pixel_shift, height, stride_dst, stride_src);
+#if defined(USE_SAO_SMALL_BUFFER)
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
+#endif
             s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
                                             sao->offset_val[c_idx], sao->band_position[c_idx],
                                             width, height);
-            restore_tqb_pixels(s, x, y, width, height, c_idx);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
             sao->type_idx[c_idx] = SAO_APPLIED;
             break;
         case SAO_EDGE:
         {
-            uint8_t left_pixels = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] != SAO_APPLIED);
+#if defined(USE_SAO_SMALL_BUFFER)
+            int w = s->sps->width >> s->sps->hshift[c_idx];
+            int h = s->sps->height >> s->sps->vshift[c_idx];
+            int left_edge = edges[0];
+            int top_edge = edges[1];
+            int right_edge = edges[2];
+            int bottom_edge = edges[3];
+            int sh = s->sps->pixel_shift;
+            int left_pixels, right_pixels;
+
+            if (!top_edge) {
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst - stride_dst - (left << sh);
+                src1[0] = src - stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
+            }
+            if (!bottom_edge) {
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst + height * stride_dst - (left << sh);
+                src1[0] = src + height * stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
+            }
+            left_pixels = 0;
+            if (!left_edge) {
+                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst - (1 << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh);
+                } else {
+                    left_pixels = 1;
+                }
+            }
+            right_pixels = 0;
+            if (!right_edge) {
+                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst + (width << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh);
+                } else {
+                    right_pixels = 1;
+                }
+            }
+
+            copy_CTB(dst - (left_pixels << sh),
+                     src - (left_pixels << sh),
+                     (width + left_pixels + right_pixels) << sh,
+                     height, stride_dst, stride_src);
+
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
+#else
+            uint8_t left_pixels;
+            /* get the CTB edge pixels from the SAO pixel buffer */
+            left_pixels = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] != SAO_APPLIED);
             if (!edges[1]) {
                 uint8_t top_left  = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED);
                 uint8_t top_right = !edges[2] && (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED);
@@ -306,6 +464,9 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
             copy_CTB(dst - (left_pixels << s->sps->pixel_shift),
                      src - (left_pixels << s->sps->pixel_shift),
                      (width + 1 + left_pixels) << s->sps->pixel_shift, height, stride_dst, stride_src);
+#endif
+            /* XXX: could handle the restoration here to simplify the
+               DSP functions */
             s->hevcdsp.sao_edge_filter[restore](src, dst,
                                                 stride_src, stride_dst,
                                                 sao,
@@ -314,7 +475,8 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
                                                 vert_edge,
                                                 horiz_edge,
                                                 diag_edge);
-            restore_tqb_pixels(s, x, y, width, height, c_idx);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
             sao->type_idx[c_idx] = SAO_APPLIED;
             break;
         }