[FFmpeg-cvslog] yadif: restore speed of the C filtering code
    James Darnley 
    git at videolan.org
       
    Wed May 15 09:47:46 CEST 2013
    
    
  
ffmpeg | branch: master | James Darnley <james.darnley at gmail.com> | Sun Mar 10 15:08:50 2013 +0100| [b0ef0ae77608a5e3d2ba68af503e8b1277a215d3] | committer: Anton Khirnov
yadif: restore speed of the C filtering code
Always use the special filter for the first and last 3 columns (only).
Changes made in 64ed397 slowed the filter to just under 3/4 of what it
was.  This commit restores the speed while maintaining identical output.
For reference, on my Athlon64:
1733222 decicycles in old
2358563 decicycles in new
1727558 decicycles in this
Signed-off-by: Anton Khirnov <anton at khirnov.net>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=b0ef0ae77608a5e3d2ba68af503e8b1277a215d3
---
 libavfilter/vf_yadif.c          |   70 ++++++++++++++++++---------------------
 libavfilter/x86/vf_yadif_init.c |   12 ++-----
 libavfilter/yadif.h             |    4 +--
 3 files changed, 37 insertions(+), 49 deletions(-)
diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index faa487f..076ad41 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -33,14 +33,17 @@
 #include <assert.h>
 
 #define CHECK(j)\
-    {   int score = FFABS(cur[mrefs + off_left + (j)] - cur[prefs + off_left - (j)])\
+    {   int score = FFABS(cur[mrefs - 1 + (j)] - cur[prefs - 1 - (j)])\
                   + FFABS(cur[mrefs  +(j)] - cur[prefs  -(j)])\
-                  + FFABS(cur[mrefs + off_right + (j)] - cur[prefs + off_right - (j)]);\
+                  + FFABS(cur[mrefs + 1 + (j)] - cur[prefs + 1 - (j)]);\
         if (score < spatial_score) {\
             spatial_score= score;\
             spatial_pred= (cur[mrefs  +(j)] + cur[prefs  -(j)])>>1;\
 
-#define FILTER(start, end) \
+/* The is_not_edge argument here controls when the code will enter a branch
+ * which reads up to and including x-3 and x+3. */
+
+#define FILTER(start, end, is_not_edge) \
     for (x = start;  x < end; x++) { \
         int c = cur[mrefs]; \
         int d = (prev2[0] + next2[0])>>1; \
@@ -50,12 +53,10 @@
         int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1; \
         int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
         int spatial_pred = (c+e) >> 1; \
-        int off_right = (x < w - 1) ? 1 : -1;\
-        int off_left  = x ? -1 : 1;\
-        int spatial_score = FFABS(cur[mrefs + off_left]  - cur[prefs + off_left]) + FFABS(c-e) \
-                          + FFABS(cur[mrefs + off_right] - cur[prefs + off_right]) - 1; \
  \
-        if (x > 2 && x < w - 3) {\
+        if (is_not_edge) {\
+            int spatial_score = FFABS(cur[mrefs - 1] - cur[prefs - 1]) + FFABS(c-e) \
+                              + FFABS(cur[mrefs + 1] - cur[prefs + 1]) - 1; \
             CHECK(-1) CHECK(-2) }} }} \
             CHECK( 1) CHECK( 2) }} }} \
         }\
@@ -96,12 +97,15 @@ static void filter_line_c(void *dst1,
     uint8_t *prev2 = parity ? prev : cur ;
     uint8_t *next2 = parity ? cur  : next;
 
-    FILTER(0, w)
+    /* The function is called with the pointers already pointing to data[3] and
+     * with 6 subtracted from the width.  This allows the FILTER macro to be
+     * called so that it processes all the pixels normally.  A constant value of
+     * true for is_not_edge lets the compiler ignore the if statement. */
+    FILTER(0, w, 1)
 }
 
 static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
-                         int w, int prefs, int mrefs, int parity, int mode,
-                         int l_edge)
+                         int w, int prefs, int mrefs, int parity, int mode)
 {
     uint8_t *dst  = dst1;
     uint8_t *prev = prev1;
@@ -111,7 +115,9 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
     uint8_t *prev2 = parity ? prev : cur ;
     uint8_t *next2 = parity ? cur  : next;
 
-    FILTER(0, l_edge)
+    /* Only edge pixels need to be processed here.  A constant value of false
+     * for is_not_edge should let the compiler ignore the whole branch. */
+    FILTER(0, 3, 0)
 
     dst  = (uint8_t*)dst1  + w - 3;
     prev = (uint8_t*)prev1 + w - 3;
@@ -120,7 +126,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
     prev2 = (uint8_t*)(parity ? prev : cur);
     next2 = (uint8_t*)(parity ? cur  : next);
 
-    FILTER(w - 3, w)
+    FILTER(w - 3, w, 0)
 }
 
 
@@ -139,12 +145,11 @@ static void filter_line_c_16bit(void *dst1,
     mrefs /= 2;
     prefs /= 2;
 
-    FILTER(0, w)
+    FILTER(0, w, 1)
 }
 
 static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
-                               int w, int prefs, int mrefs, int parity, int mode,
-                               int l_edge)
+                               int w, int prefs, int mrefs, int parity, int mode)
 {
     uint16_t *dst  = dst1;
     uint16_t *prev = prev1;
@@ -154,7 +159,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
     uint16_t *prev2 = parity ? prev : cur ;
     uint16_t *next2 = parity ? cur  : next;
 
-    FILTER(0, l_edge)
+    FILTER(0, 3, 0)
 
     dst   = (uint16_t*)dst1  + w - 3;
     prev  = (uint16_t*)prev1 + w - 3;
@@ -163,7 +168,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
     prev2 = (uint16_t*)(parity ? prev : cur);
     next2 = (uint16_t*)(parity ? cur  : next);
 
-    FILTER(w - 3, w)
+    FILTER(w - 3, w, 0)
 }
 
 static void filter(AVFilterContext *ctx, AVFrame *dstpic,
@@ -177,7 +182,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
         int h = dstpic->height;
         int refs = yadif->cur->linesize[i];
         int df = (yadif->csp->comp[i].depth_minus1 + 8) / 8;
-        int l_edge, l_edge_pix;
+        int pix_3 = 3 * df;
 
         if (i == 1 || i == 2) {
         /* Why is this not part of the per-plane description thing? */
@@ -188,8 +193,6 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
         /* filtering reads 3 pixels to the left/right; to avoid invalid reads,
          * we need to call the c variant which avoids this for border pixels
          */
-        l_edge     = yadif->req_align;
-        l_edge_pix = l_edge / df;
 
         for (y = 0; y < h; y++) {
             if ((y ^ parity) & 1) {
@@ -198,22 +201,15 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
                 uint8_t *next = &yadif->next->data[i][y * refs];
                 uint8_t *dst  = &dstpic->data[i][y * dstpic->linesize[i]];
                 int     mode  = y == 1 || y + 2 == h ? 2 : yadif->mode;
-                if (yadif->req_align) {
-                    yadif->filter_line(dst + l_edge, prev + l_edge, cur + l_edge,
-                                       next + l_edge, w - l_edge_pix - 3,
-                                       y + 1 < h ? refs : -refs,
-                                       y ? -refs : refs,
-                                       parity ^ tff, mode);
-                    yadif->filter_edges(dst, prev, cur, next, w,
-                                         y + 1 < h ? refs : -refs,
-                                         y ? -refs : refs,
-                                         parity ^ tff, mode, l_edge_pix);
-                } else {
-                    yadif->filter_line(dst, prev, cur, next + l_edge, w,
-                                       y + 1 < h ? refs : -refs,
-                                       y ? -refs : refs,
-                                       parity ^ tff, mode);
-                }
+                yadif->filter_line(dst + pix_3, prev + pix_3, cur + pix_3,
+                                   next + pix_3, w - 6,
+                                   y + 1 < h ? refs : -refs,
+                                   y ? -refs : refs,
+                                   parity ^ tff, mode);
+                yadif->filter_edges(dst, prev, cur, next, w,
+                                    y + 1 < h ? refs : -refs,
+                                    y ? -refs : refs,
+                                    parity ^ tff, mode);
             } else {
                 memcpy(&dstpic->data[i][y * dstpic->linesize[i]],
                        &yadif->cur->data[i][y * refs], w * df);
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 99520a2..5978a4f 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -42,18 +42,12 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
     int cpu_flags = av_get_cpu_flags();
 
 #if ARCH_X86_32
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
+    if (EXTERNAL_MMXEXT(cpu_flags))
         yadif->filter_line = ff_yadif_filter_line_mmxext;
-        yadif->req_align   = 8;
-    }
 #endif /* ARCH_X86_32 */
-    if (EXTERNAL_SSE2(cpu_flags)) {
+    if (EXTERNAL_SSE2(cpu_flags))
         yadif->filter_line = ff_yadif_filter_line_sse2;
-        yadif->req_align   = 16;
-    }
-    if (EXTERNAL_SSSE3(cpu_flags)) {
+    if (EXTERNAL_SSSE3(cpu_flags))
         yadif->filter_line = ff_yadif_filter_line_ssse3;
-        yadif->req_align   = 16;
-    }
 #endif /* HAVE_YASM */
 }
diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
index 665922d..6936723 100644
--- a/libavfilter/yadif.h
+++ b/libavfilter/yadif.h
@@ -55,13 +55,11 @@ typedef struct YADIFContext {
     /**
      * Required alignment for filter_line
      */
-    int req_align;
     void (*filter_line)(void *dst,
                         void *prev, void *cur, void *next,
                         int w, int prefs, int mrefs, int parity, int mode);
     void (*filter_edges)(void *dst, void *prev, void *cur, void *next,
-                         int w, int prefs, int mrefs, int parity, int mode,
-                         int l_edge);
+                         int w, int prefs, int mrefs, int parity, int mode);
 
     const AVPixFmtDescriptor *csp;
     int eof;
    
    
More information about the ffmpeg-cvslog
mailing list