[FFmpeg-devel] [PATCH] avfilter/dctdnoiz: rewrite [f/i]dct
Michael Niedermayer
michaelni at gmx.at
Mon Aug 4 00:44:48 CEST 2014
On Sun, Aug 03, 2014 at 10:27:21PM +0200, Clément Bœsch wrote:
> This removes the avcodec dependency and make the code almost twice as
> fast. More to come.
>
> The DCT factorization is based on "Fast and numerically stable
> algorithms for discrete cosine transforms" from Gerlind Plonkaa &
> Manfred Tasche (DOI: 10.1016/j.laa.2004.07.015).
> ---
> configure | 2 -
> libavfilter/vf_dctdnoiz.c | 328 +++++++++++++++++++++++++++++++++-------------
> 2 files changed, 240 insertions(+), 90 deletions(-)
>
> diff --git a/configure b/configure
> index 9c3af50..6196b2a 100755
> --- a/configure
> +++ b/configure
> @@ -2526,8 +2526,6 @@ boxblur_filter_deps="gpl"
> bs2b_filter_deps="libbs2b"
> colormatrix_filter_deps="gpl"
> cropdetect_filter_deps="gpl"
> -dctdnoiz_filter_deps="avcodec"
> -dctdnoiz_filter_select="dct"
> delogo_filter_deps="gpl"
> deshake_filter_deps="avcodec"
> deshake_filter_select="me_cmp"
> diff --git a/libavfilter/vf_dctdnoiz.c b/libavfilter/vf_dctdnoiz.c
> index 71b8536..6d24934 100644
> --- a/libavfilter/vf_dctdnoiz.c
> +++ b/libavfilter/vf_dctdnoiz.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2013 Clément Bœsch
> + * Copyright (c) 2013-2014 Clément Bœsch
> *
> * This file is part of FFmpeg.
> *
> @@ -23,7 +23,6 @@
> * @see http://www.ipol.im/pub/art/2011/ys-dct/
> */
>
> -#include "libavcodec/avfft.h"
> #include "libavutil/eval.h"
> #include "libavutil/opt.h"
> #include "drawutils.h"
> @@ -35,7 +34,7 @@
> static const char *const var_names[] = { "c", NULL };
> enum { VAR_C, VAR_VARS_NB };
>
> -typedef struct {
> +typedef struct DCTdnoizContext {
> const AVClass *class;
>
> /* coefficient factor expression */
> @@ -52,8 +51,9 @@ typedef struct {
> int p_linesize; // line sizes for color and weights
> int overlap; // number of block overlapping pixels
> int step; // block step increment (BSIZE - overlap)
> - DCTContext *dct, *idct; // DCT and inverse DCT contexts
> - float *block, *tmp_block; // two BSIZE x BSIZE block buffers
> + void (*filter_freq_func)(struct DCTdnoizContext *s,
> + const float *src, int src_linesize,
> + float *dst, int dst_linesize);
> } DCTdnoizContext;
>
> #define OFFSET(x) offsetof(DCTdnoizContext, x)
> @@ -69,66 +69,245 @@ static const AVOption dctdnoiz_options[] = {
>
> AVFILTER_DEFINE_CLASS(dctdnoiz);
>
> -static float *dct_block(DCTdnoizContext *ctx, const float *src, int src_linesize)
> +static void av_always_inline fdct16_1d(float *dst, const float *src,
> + int dst_stridea, int dst_strideb,
> + int src_stridea, int src_strideb)
> {
> - int x, y;
> - float *column;
> -
> - for (y = 0; y < BSIZE; y++) {
> - float *line = ctx->block;
> + int i;
>
> - memcpy(line, src, BSIZE * sizeof(*line));
> - src += src_linesize;
> - av_dct_calc(ctx->dct, line);
> -
> - column = ctx->tmp_block + y;
> - column[0] = line[0] * (1. / sqrt(BSIZE));
> - column += BSIZE;
> - for (x = 1; x < BSIZE; x++) {
> - *column = line[x] * sqrt(2. / BSIZE);
> - column += BSIZE;
> - }
> + for (i = 0; i < BSIZE; i++) {
> + const float x0_0 = src[ 0*src_stridea] + src[15*src_stridea];
> + const float x0_1 = src[ 1*src_stridea] + src[14*src_stridea];
> + const float x0_2 = src[ 2*src_stridea] + src[13*src_stridea];
> + const float x0_3 = src[ 3*src_stridea] + src[12*src_stridea];
> + const float x0_4 = src[ 4*src_stridea] + src[11*src_stridea];
> + const float x0_5 = src[ 5*src_stridea] + src[10*src_stridea];
> + const float x0_6 = src[ 6*src_stridea] + src[ 9*src_stridea];
> + const float x0_7 = src[ 7*src_stridea] + src[ 8*src_stridea];
> + const float x0_8 = src[ 0*src_stridea] - src[15*src_stridea];
> + const float x0_9 = src[ 1*src_stridea] - src[14*src_stridea];
> + const float x0_a = src[ 2*src_stridea] - src[13*src_stridea];
> + const float x0_b = src[ 3*src_stridea] - src[12*src_stridea];
> + const float x0_c = src[ 4*src_stridea] - src[11*src_stridea];
> + const float x0_d = src[ 5*src_stridea] - src[10*src_stridea];
> + const float x0_e = src[ 6*src_stridea] - src[ 9*src_stridea];
> + const float x0_f = src[ 7*src_stridea] - src[ 8*src_stridea];
> + const float x2_0 = x0_0 + x0_7;
> + const float x2_1 = x0_1 + x0_6;
> + const float x2_2 = x0_2 + x0_5;
> + const float x2_3 = x0_3 + x0_4;
> + const float x2_4 = x0_0 - x0_7;
> + const float x2_5 = x0_1 - x0_6;
> + const float x2_6 = x0_2 - x0_5;
> + const float x2_7 = x0_3 - x0_4;
> + const float x4_0 = x2_0 + x2_3;
> + const float x4_1 = x2_1 + x2_2;
> + const float x4_2 = x2_0 - x2_3;
> + const float x4_3 = x2_1 - x2_2;
> + const float x5_0 = x4_0 + x4_1;
> + const float x5_1 = x4_0 - x4_1;
> + const float x5_2 = 1.306562964876380*x4_2 + 0.541196100146197*x4_3;
> + const float x5_3 = 0.541196100146197*x4_2 - 1.306562964876380*x4_3;
> + const float x6_0 = 1.387039845322150*x2_4 + 0.275899379282943*x2_7;
> + const float x6_1 = 1.175875602419360*x2_5 + 0.785694958387102*x2_6;
> + const float x6_2 = -0.785694958387102*x2_5 + 1.175875602419360*x2_6;
> + const float x6_3 = 0.275899379282943*x2_4 - 1.387039845322150*x2_7;
> + const float x7_0 = x6_0 + x6_1;
> + const float x7_1 = x6_0 - x6_1;
> + const float x7_2 = x6_2 + x6_3;
> + const float x7_3 = x6_2 - x6_3;
> + const float x3_5 = 0.707106781186547*x7_1 - 0.707106781186547*x7_3;
> + const float x3_6 = 0.707106781186547*x7_1 + 0.707106781186547*x7_3;
> + const float x8_0 = 1.407403737526380*x0_8 + 0.138617169199091*x0_f;
> + const float x8_1 = 1.353318001174350*x0_9 + 0.410524527522357*x0_e;
> + const float x8_2 = 1.247225012986670*x0_a + 0.666655658477747*x0_d;
> + const float x8_3 = 1.093201867001760*x0_b + 0.897167586342636*x0_c;
> + const float x8_4 = -0.897167586342636*x0_b + 1.093201867001760*x0_c;
> + const float x8_5 = 0.666655658477747*x0_a - 1.247225012986670*x0_d;
> + const float x8_6 = -0.410524527522357*x0_9 + 1.353318001174350*x0_e;
> + const float x8_7 = 0.138617169199091*x0_8 - 1.407403737526380*x0_f;
> + const float xa_0 = x8_0 + x8_3;
> + const float xa_1 = x8_1 + x8_2;
> + const float xa_2 = x8_0 - x8_3;
> + const float xa_3 = x8_1 - x8_2;
> + const float xb_0 = xa_0 + xa_1;
> + const float xb_1 = xa_0 - xa_1;
> + const float xb_2 = 1.306562964876380*xa_2 + 0.541196100146197*xa_3;
> + const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3;
> + const float xc_0 = x8_4 + x8_7;
> + const float xc_1 = x8_5 + x8_6;
> + const float xc_2 = x8_4 - x8_7;
> + const float xc_3 = x8_5 - x8_6;
> + const float xd_0 = xc_0 + xc_1;
> + const float xd_1 = xc_0 - xc_1;
> + const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3;
> + const float xd_3 = 0.541196100146197*xc_2 - 1.306562964876380*xc_3;
> + const float x1_9 = 0.707106781186547*xb_2 - 0.707106781186547*xd_3;
> + const float x1_a = 0.707106781186547*xb_2 + 0.707106781186547*xd_3;
> + const float x1_b = 0.707106781186547*xb_1 + 0.707106781186547*xd_1;
> + const float x1_c = 0.707106781186547*xb_1 - 0.707106781186547*xd_1;
> + const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2;
> + const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2;
> + dst[ 0*dst_stridea] = 0.25*x5_0;
> + dst[ 1*dst_stridea] = 0.25*xb_0;
> + dst[ 2*dst_stridea] = 0.25*x7_0;
> + dst[ 3*dst_stridea] = 0.25*x1_9;
> + dst[ 4*dst_stridea] = 0.25*x5_2;
> + dst[ 5*dst_stridea] = 0.25*x1_a;
> + dst[ 6*dst_stridea] = 0.25*x3_5;
> + dst[ 7*dst_stridea] = 0.25*x1_b;
> + dst[ 8*dst_stridea] = 0.25*x5_1;
> + dst[ 9*dst_stridea] = 0.25*x1_c;
> + dst[10*dst_stridea] = 0.25*x3_6;
> + dst[11*dst_stridea] = 0.25*x1_d;
> + dst[12*dst_stridea] = 0.25*x5_3;
> + dst[13*dst_stridea] = 0.25*x1_e;
> + dst[14*dst_stridea] = 0.25*x7_2;
> + dst[15*dst_stridea] = 0.25*xd_0;
many of these multiplies look like they can be merged into other
multiplies
for example see:
const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3;
const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3;
const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2;
const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2;
dst[11*dst_stridea] = 0.25*x1_d;
dst[13*dst_stridea] = 0.25*x1_e;
vs.
const float xd_2 = (0.25*0.707106781186547*1.306562964876380)*xc_2 + (0.25*0.707106781186547*0.541196100146197)*xc_3;
const float xb_3 = (0.25*0.707106781186547*0.541196100146197)*xa_2 - (0.25*0.707106781186547*1.306562964876380)*xa_3;
dst[11*dst_stridea] = xb_3 - xd_2;
dst[13*dst_stridea] = xb_3 + xd_2;
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
It is what and why we do it that matters, not just one of them.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140804/9ca32c44/attachment.asc>
More information about the ffmpeg-devel
mailing list