[FFmpeg-devel] [PATCH] yadif: restore speed of the C filtering code
James Darnley
james.darnley at gmail.com
Fri Mar 1 18:20:19 CET 2013
Always use the special edge filter for the first and last 3 columns, and only for those columns.
The changes made in 64ed397 slowed the filter to just under 3/4 of what
it was. This commit restores almost all of that speed while maintaining
identical output.
For reference, on my Athlon64:
1733222 decicycles in old (before 64ed397)
2358563 decicycles in new (after 64ed397)
1740014 decicycles in this (with this patch applied)
---
libavfilter/vf_yadif.c | 93 +++++++++++++++++++++++---------------
libavfilter/x86/vf_yadif_init.c | 12 +----
libavfilter/yadif.h | 4 +-
3 files changed, 60 insertions(+), 49 deletions(-)
diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index b7c2d80..3bd0d17 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -34,9 +34,9 @@
#define PERM_RWP AV_PERM_WRITE | AV_PERM_PRESERVE | AV_PERM_REUSE
#define CHECK(j)\
- { int score = FFABS(cur[mrefs + off_left + (j)] - cur[prefs + off_left - (j)])\
+ { int score = FFABS(cur[mrefs - 1 + (j)] - cur[prefs - 1 - (j)])\
+ FFABS(cur[mrefs +(j)] - cur[prefs -(j)])\
- + FFABS(cur[mrefs + off_right + (j)] - cur[prefs + off_right - (j)]);\
+ + FFABS(cur[mrefs + 1 + (j)] - cur[prefs + 1 - (j)]);\
if (score < spatial_score) {\
spatial_score= score;\
spatial_pred= (cur[mrefs +(j)] + cur[prefs -(j)])>>1;\
@@ -51,15 +51,46 @@
int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1; \
int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
int spatial_pred = (c+e) >> 1; \
- int off_right = (x < w - 1) ? 1 : -1;\
- int off_left = x ? -1 : 1;\
- int spatial_score = FFABS(cur[mrefs + off_left] - cur[prefs + off_left]) + FFABS(c-e) \
- + FFABS(cur[mrefs + off_right] - cur[prefs + off_right]) - 1; \
+ int spatial_score = FFABS(cur[mrefs - 1] - cur[prefs - 1]) + FFABS(c-e) \
+ + FFABS(cur[mrefs + 1] - cur[prefs + 1]) - 1; \
\
- if (x > 2 && x < w - 3) {\
- CHECK(-1) CHECK(-2) }} }} \
- CHECK( 1) CHECK( 2) }} }} \
- }\
+ CHECK(-1) CHECK(-2) }} }} \
+ CHECK( 1) CHECK( 2) }} }} \
+ \
+ if (mode < 2) { \
+ int b = (prev2[2 * mrefs] + next2[2 * mrefs])>>1; \
+ int f = (prev2[2 * prefs] + next2[2 * prefs])>>1; \
+ int max = FFMAX3(d - e, d - c, FFMIN(b - c, f - e)); \
+ int min = FFMIN3(d - e, d - c, FFMAX(b - c, f - e)); \
+ \
+ diff = FFMAX3(diff, min, -max); \
+ } \
+ \
+ if (spatial_pred > d + diff) \
+ spatial_pred = d + diff; \
+ else if (spatial_pred < d - diff) \
+ spatial_pred = d - diff; \
+ \
+ dst[0] = spatial_pred; \
+ \
+ dst++; \
+ cur++; \
+ prev++; \
+ next++; \
+ prev2++; \
+ next2++; \
+ }
+
+#define FILTER_EDGES(start, end) \
+ for (x = start; x < end; x++) { \
+ int c = cur[mrefs]; \
+ int d = (prev2[0] + next2[0])>>1; \
+ int e = cur[prefs]; \
+ int temporal_diff0 = FFABS(prev2[0] - next2[0]); \
+ int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e) )>>1; \
+ int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1; \
+ int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
+ int spatial_pred = (c+e) >> 1; \
\
if (mode < 2) { \
int b = (prev2[2 * mrefs] + next2[2 * mrefs])>>1; \
@@ -101,8 +132,7 @@ static void filter_line_c(void *dst1,
}
static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int parity, int mode,
- int l_edge)
+ int w, int prefs, int mrefs, int parity, int mode)
{
uint8_t *dst = dst1;
uint8_t *prev = prev1;
@@ -112,7 +142,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
uint8_t *prev2 = parity ? prev : cur ;
uint8_t *next2 = parity ? cur : next;
- FILTER(0, l_edge)
+ FILTER_EDGES(0, 3)
dst = (uint8_t*)dst1 + w - 3;
prev = (uint8_t*)prev1 + w - 3;
@@ -121,7 +151,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1,
prev2 = (uint8_t*)(parity ? prev : cur);
next2 = (uint8_t*)(parity ? cur : next);
- FILTER(w - 3, w)
+ FILTER_EDGES(w - 3, w)
}
@@ -144,8 +174,7 @@ static void filter_line_c_16bit(void *dst1,
}
static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int parity, int mode,
- int l_edge)
+ int w, int prefs, int mrefs, int parity, int mode)
{
uint16_t *dst = dst1;
uint16_t *prev = prev1;
@@ -155,7 +184,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
uint16_t *prev2 = parity ? prev : cur ;
uint16_t *next2 = parity ? cur : next;
- FILTER(0, l_edge)
+ FILTER_EDGES(0, 3)
dst = (uint16_t*)dst1 + w - 3;
prev = (uint16_t*)prev1 + w - 3;
@@ -164,7 +193,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1,
prev2 = (uint16_t*)(parity ? prev : cur);
next2 = (uint16_t*)(parity ? cur : next);
- FILTER(w - 3, w)
+ FILTER_EDGES(w - 3, w)
}
static void filter(AVFilterContext *ctx, AVFilterBufferRef *dstpic,
@@ -178,7 +207,7 @@ static void filter(AVFilterContext *ctx, AVFilterBufferRef *dstpic,
int h = dstpic->video->h;
int refs = yadif->cur->linesize[i];
int df = (yadif->csp->comp[i].depth_minus1 + 8) / 8;
- int l_edge, l_edge_pix;
+ int pix_3 = 3 * df;
if (i == 1 || i == 2) {
/* Why is this not part of the per-plane description thing? */
@@ -189,8 +218,6 @@ static void filter(AVFilterContext *ctx, AVFilterBufferRef *dstpic,
/* filtering reads 3 pixels to the left/right; to avoid invalid reads,
* we need to call the c variant which avoids this for border pixels
*/
- l_edge = yadif->req_align;
- l_edge_pix = l_edge / df;
for (y = 0; y < h; y++) {
if ((y ^ parity) & 1) {
@@ -199,22 +226,14 @@ static void filter(AVFilterContext *ctx, AVFilterBufferRef *dstpic,
uint8_t *next = &yadif->next->data[i][y * refs];
uint8_t *dst = &dstpic->data[i][y * dstpic->linesize[i]];
int mode = y == 1 || y + 2 == h ? 2 : yadif->mode;
- if (yadif->req_align) {
- yadif->filter_line(dst + l_edge, prev + l_edge, cur + l_edge,
- next + l_edge, w - l_edge_pix - 3,
- y + 1 < h ? refs : -refs,
- y ? -refs : refs,
- parity ^ tff, mode);
- yadif->filter_edges(dst, prev, cur, next, w,
- y + 1 < h ? refs : -refs,
- y ? -refs : refs,
- parity ^ tff, mode, l_edge_pix);
- } else {
- yadif->filter_line(dst, prev, cur, next + l_edge, w,
- y + 1 < h ? refs : -refs,
- y ? -refs : refs,
- parity ^ tff, mode);
- }
+ yadif->filter_line(dst + pix_3, prev + pix_3, cur + pix_3, next + pix_3, w - 6,
+ y + 1 < h ? refs : -refs,
+ y ? -refs : refs,
+ parity ^ tff, mode);
+ yadif->filter_edges(dst, prev, cur, next, w,
+ y + 1 < h ? refs : -refs,
+ y ? -refs : refs,
+ parity ^ tff, mode);
} else {
memcpy(&dstpic->data[i][y * dstpic->linesize[i]],
&yadif->cur->data[i][y * refs], w * df);
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 2873744..8d5e768 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -42,18 +42,12 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
#if HAVE_YASM
#if ARCH_X86_32
- if (EXTERNAL_MMXEXT(cpu_flags)) {
+ if (EXTERNAL_MMXEXT(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_mmxext;
- yadif->req_align = 8;
- }
#endif /* ARCH_X86_32 */
- if (EXTERNAL_SSE2(cpu_flags)) {
+ if (EXTERNAL_SSE2(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_sse2;
- yadif->req_align = 16;
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
+ if (EXTERNAL_SSSE3(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_ssse3;
- yadif->req_align = 16;
- }
#endif /* HAVE_YASM */
}
diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
index 50fc856..2c3f125 100644
--- a/libavfilter/yadif.h
+++ b/libavfilter/yadif.h
@@ -57,13 +57,11 @@ typedef struct YADIFContext {
/**
* Required alignment for filter_line
*/
- int req_align;
void (*filter_line)(void *dst,
void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int parity, int mode);
void (*filter_edges)(void *dst, void *prev, void *cur, void *next,
- int w, int prefs, int mrefs, int parity, int mode,
- int l_edge);
+ int w, int prefs, int mrefs, int parity, int mode);
const AVPixFmtDescriptor *csp;
int eof;
--
1.7.9
More information about the ffmpeg-devel
mailing list