[FFmpeg-devel] [PATCH] yadif: restore speed of the C filtering code
Paul B Mahol
onemda at gmail.com
Fri Mar 1 18:31:45 CET 2013
On 3/1/13, James Darnley <james.darnley at gmail.com> wrote:
> Always use the special filter for the first and last 3 columns (only).
>
> The changes made in 64ed397 slowed the filter to just under 3/4 of what
> it was. This commit restores almost all of that speed while maintaining
> identical output.
>
> For reference, on my Athlon64:
> 1733222 decicycles in old
> 2358563 decicycles in new
> 1740014 decicycles in this
> ---
> libavfilter/vf_yadif.c | 93 +++++++++++++++++++++++---------------
> libavfilter/x86/vf_yadif_init.c | 12 +----
> libavfilter/yadif.h | 4 +-
> 3 files changed, 60 insertions(+), 49 deletions(-)
That commit claimed it fixed overreads.
Have you checked that no overreads happen with this patch?
>
> diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
> index b7c2d80..3bd0d17 100644
> --- a/libavfilter/vf_yadif.c
> +++ b/libavfilter/vf_yadif.c
> @@ -34,9 +34,9 @@
> #define PERM_RWP AV_PERM_WRITE | AV_PERM_PRESERVE | AV_PERM_REUSE
>
> #define CHECK(j)\
> - { int score = FFABS(cur[mrefs + off_left + (j)] - cur[prefs +
> off_left - (j)])\
> + { int score = FFABS(cur[mrefs - 1 + (j)] - cur[prefs - 1 - (j)])\
> + FFABS(cur[mrefs +(j)] - cur[prefs -(j)])\
> - + FFABS(cur[mrefs + off_right + (j)] - cur[prefs +
> off_right - (j)]);\
> + + FFABS(cur[mrefs + 1 + (j)] - cur[prefs + 1 - (j)]);\
> if (score < spatial_score) {\
> spatial_score= score;\
> spatial_pred= (cur[mrefs +(j)] + cur[prefs -(j)])>>1;\
> @@ -51,15 +51,46 @@
> int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] -
> e) )>>1; \
> int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1,
> temporal_diff2); \
> int spatial_pred = (c+e) >> 1; \
> - int off_right = (x < w - 1) ? 1 : -1;\
> - int off_left = x ? -1 : 1;\
> - int spatial_score = FFABS(cur[mrefs + off_left] - cur[prefs +
> off_left]) + FFABS(c-e) \
> - + FFABS(cur[mrefs + off_right] - cur[prefs +
> off_right]) - 1; \
> + int spatial_score = FFABS(cur[mrefs - 1] - cur[prefs - 1]) +
> FFABS(c-e) \
> + + FFABS(cur[mrefs + 1] - cur[prefs + 1]) - 1; \
> \
> - if (x > 2 && x < w - 3) {\
> - CHECK(-1) CHECK(-2) }} }} \
> - CHECK( 1) CHECK( 2) }} }} \
> - }\
> + CHECK(-1) CHECK(-2) }} }} \
> + CHECK( 1) CHECK( 2) }} }} \
> + \
> + if (mode < 2) { \
> + int b = (prev2[2 * mrefs] + next2[2 * mrefs])>>1; \
> + int f = (prev2[2 * prefs] + next2[2 * prefs])>>1; \
> + int max = FFMAX3(d - e, d - c, FFMIN(b - c, f - e)); \
> + int min = FFMIN3(d - e, d - c, FFMAX(b - c, f - e)); \
> + \
> + diff = FFMAX3(diff, min, -max); \
> + } \
> + \
> + if (spatial_pred > d + diff) \
> + spatial_pred = d + diff; \
> + else if (spatial_pred < d - diff) \
> + spatial_pred = d - diff; \
> + \
> + dst[0] = spatial_pred; \
> + \
> + dst++; \
> + cur++; \
> + prev++; \
> + next++; \
> + prev2++; \
> + next2++; \
> + }
> +
> +#define FILTER_EDGES(start, end) \
> + for (x = start; x < end; x++) { \
> + int c = cur[mrefs]; \
> + int d = (prev2[0] + next2[0])>>1; \
> + int e = cur[prefs]; \
> + int temporal_diff0 = FFABS(prev2[0] - next2[0]); \
> + int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] -
> e) )>>1; \
> + int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] -
> e) )>>1; \
> + int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1,
> temporal_diff2); \
> + int spatial_pred = (c+e) >> 1; \
> \
> if (mode < 2) { \
> int b = (prev2[2 * mrefs] + next2[2 * mrefs])>>1; \
> @@ -101,8 +132,7 @@ static void filter_line_c(void *dst1,
> }
>
> static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
> - int w, int prefs, int mrefs, int parity, int
> mode,
> - int l_edge)
> + int w, int prefs, int mrefs, int parity, int
> mode)
> {
> uint8_t *dst = dst1;
> uint8_t *prev = prev1;
> @@ -112,7 +142,7 @@ static void filter_edges(void *dst1, void *prev1, void
> *cur1, void *next1,
> uint8_t *prev2 = parity ? prev : cur ;
> uint8_t *next2 = parity ? cur : next;
>
> - FILTER(0, l_edge)
> + FILTER_EDGES(0, 3)
>
> dst = (uint8_t*)dst1 + w - 3;
> prev = (uint8_t*)prev1 + w - 3;
> @@ -121,7 +151,7 @@ static void filter_edges(void *dst1, void *prev1, void
> *cur1, void *next1,
> prev2 = (uint8_t*)(parity ? prev : cur);
> next2 = (uint8_t*)(parity ? cur : next);
>
> - FILTER(w - 3, w)
> + FILTER_EDGES(w - 3, w)
> }
>
>
> @@ -144,8 +174,7 @@ static void filter_line_c_16bit(void *dst1,
> }
>
> static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void
> *next1,
> - int w, int prefs, int mrefs, int parity, int
> mode,
> - int l_edge)
> + int w, int prefs, int mrefs, int parity, int
> mode)
> {
> uint16_t *dst = dst1;
> uint16_t *prev = prev1;
> @@ -155,7 +184,7 @@ static void filter_edges_16bit(void *dst1, void *prev1,
> void *cur1, void *next1,
> uint16_t *prev2 = parity ? prev : cur ;
> uint16_t *next2 = parity ? cur : next;
>
> - FILTER(0, l_edge)
> + FILTER_EDGES(0, 3)
>
> dst = (uint16_t*)dst1 + w - 3;
> prev = (uint16_t*)prev1 + w - 3;
> @@ -164,7 +193,7 @@ static void filter_edges_16bit(void *dst1, void *prev1,
> void *cur1, void *next1,
> prev2 = (uint16_t*)(parity ? prev : cur);
> next2 = (uint16_t*)(parity ? cur : next);
>
> - FILTER(w - 3, w)
> + FILTER_EDGES(w - 3, w)
> }
>
> static void filter(AVFilterContext *ctx, AVFilterBufferRef *dstpic,
> @@ -178,7 +207,7 @@ static void filter(AVFilterContext *ctx,
> AVFilterBufferRef *dstpic,
> int h = dstpic->video->h;
> int refs = yadif->cur->linesize[i];
> int df = (yadif->csp->comp[i].depth_minus1 + 8) / 8;
> - int l_edge, l_edge_pix;
> + int pix_3 = 3 * df;
>
> if (i == 1 || i == 2) {
> /* Why is this not part of the per-plane description thing? */
> @@ -189,8 +218,6 @@ static void filter(AVFilterContext *ctx,
> AVFilterBufferRef *dstpic,
> /* filtering reads 3 pixels to the left/right; to avoid invalid
> reads,
> * we need to call the c variant which avoids this for border
> pixels
> */
> - l_edge = yadif->req_align;
> - l_edge_pix = l_edge / df;
>
> for (y = 0; y < h; y++) {
> if ((y ^ parity) & 1) {
> @@ -199,22 +226,14 @@ static void filter(AVFilterContext *ctx,
> AVFilterBufferRef *dstpic,
> uint8_t *next = &yadif->next->data[i][y * refs];
> uint8_t *dst = &dstpic->data[i][y * dstpic->linesize[i]];
> int mode = y == 1 || y + 2 == h ? 2 : yadif->mode;
> - if (yadif->req_align) {
> - yadif->filter_line(dst + l_edge, prev + l_edge, cur +
> l_edge,
> - next + l_edge, w - l_edge_pix - 3,
> - y + 1 < h ? refs : -refs,
> - y ? -refs : refs,
> - parity ^ tff, mode);
> - yadif->filter_edges(dst, prev, cur, next, w,
> - y + 1 < h ? refs : -refs,
> - y ? -refs : refs,
> - parity ^ tff, mode, l_edge_pix);
> - } else {
> - yadif->filter_line(dst, prev, cur, next + l_edge, w,
> - y + 1 < h ? refs : -refs,
> - y ? -refs : refs,
> - parity ^ tff, mode);
> - }
> + yadif->filter_line(dst + pix_3, prev + pix_3, cur + pix_3,
> next + pix_3, w - 6,
> + y + 1 < h ? refs : -refs,
> + y ? -refs : refs,
> + parity ^ tff, mode);
> + yadif->filter_edges(dst, prev, cur, next, w,
> + y + 1 < h ? refs : -refs,
> + y ? -refs : refs,
> + parity ^ tff, mode);
> } else {
> memcpy(&dstpic->data[i][y * dstpic->linesize[i]],
> &yadif->cur->data[i][y * refs], w * df);
> diff --git a/libavfilter/x86/vf_yadif_init.c
> b/libavfilter/x86/vf_yadif_init.c
> index 2873744..8d5e768 100644
> --- a/libavfilter/x86/vf_yadif_init.c
> +++ b/libavfilter/x86/vf_yadif_init.c
> @@ -42,18 +42,12 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
>
> #if HAVE_YASM
> #if ARCH_X86_32
> - if (EXTERNAL_MMXEXT(cpu_flags)) {
> + if (EXTERNAL_MMXEXT(cpu_flags))
> yadif->filter_line = ff_yadif_filter_line_mmxext;
> - yadif->req_align = 8;
> - }
> #endif /* ARCH_X86_32 */
> - if (EXTERNAL_SSE2(cpu_flags)) {
> + if (EXTERNAL_SSE2(cpu_flags))
> yadif->filter_line = ff_yadif_filter_line_sse2;
> - yadif->req_align = 16;
> - }
> - if (EXTERNAL_SSSE3(cpu_flags)) {
> + if (EXTERNAL_SSSE3(cpu_flags))
> yadif->filter_line = ff_yadif_filter_line_ssse3;
> - yadif->req_align = 16;
> - }
> #endif /* HAVE_YASM */
> }
> diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
> index 50fc856..2c3f125 100644
> --- a/libavfilter/yadif.h
> +++ b/libavfilter/yadif.h
> @@ -57,13 +57,11 @@ typedef struct YADIFContext {
> /**
> * Required alignment for filter_line
> */
> - int req_align;
> void (*filter_line)(void *dst,
> void *prev, void *cur, void *next,
> int w, int prefs, int mrefs, int parity, int
> mode);
> void (*filter_edges)(void *dst, void *prev, void *cur, void *next,
> - int w, int prefs, int mrefs, int parity, int
> mode,
> - int l_edge);
> + int w, int prefs, int mrefs, int parity, int
> mode);
>
> const AVPixFmtDescriptor *csp;
> int eof;
> --
> 1.7.9
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
More information about the ffmpeg-devel
mailing list