[FFmpeg-devel] [PATCH 2/3] avfilter/avf_showcqt: cqt_calc x86 optimization
Muhammad Faiz
mfcc64 at gmail.com
Thu Mar 10 10:53:14 CET 2016
use intrinsics

cqt_time:
plain = 3.286 s
SSE   = 1.725 s
SSE3  = 1.692 s
AVX   = 1.399 s
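
For reference, a minimal scalar sketch of the kernel these functions
vectorize, reconstructed from the SIMD code below (the in-tree plain
C path may differ in detail):

    static void cqt_calc_plain_sketch(FFTComplex *dst, const FFTComplex *src,
                                      const Coeffs *coeffs, int len, int fft_len)
    {
        int k, x, i, j;
        for (k = 0; k < len; k++) {
            float a_re = 0.0f, a_im = 0.0f, b_re = 0.0f, b_im = 0.0f;
            for (x = 0; x < coeffs[k].len; x++) {
                float u = coeffs[k].val[x];
                i = coeffs[k].start + x;        /* forward FFT bin */
                j = fft_len - i;                /* mirrored FFT bin */
                a_re += u * src[i].re;  a_im += u * src[i].im;
                b_re += u * src[j].re;  b_im += u * src[j].im;
            }
            /* separate left and right (times 2.0), then store |L|^2 and |R|^2 */
            dst[k].re = (a_re + b_re) * (a_re + b_re) + (a_im - b_im) * (a_im - b_im);
            dst[k].im = (b_im + a_im) * (b_im + a_im) + (b_re - a_re) * (b_re - a_re);
        }
    }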
Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
---
 libavfilter/avf_showcqt.c     |   7 +
 libavfilter/avf_showcqt.h     |   4 +
 libavfilter/x86/Makefile      |   1 +
 libavfilter/x86/avf_showcqt.c | 289 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 301 insertions(+)
create mode 100644 libavfilter/x86/avf_showcqt.c
diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
index 8928bfb..2d2644c 100644
--- a/libavfilter/avf_showcqt.c
+++ b/libavfilter/avf_showcqt.c
@@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
w *= sign * (1.0 / s->fft_len);
s->coeffs[m].val[x - s->coeffs[m].start] = w;
}
+
+ if (s->permute_coeffs)
+ s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
}
av_expr_free(expr);
@@ -1189,6 +1192,10 @@ static int config_output(AVFilterLink *outlink)
s->update_sono = update_sono_yuv;
}
+ /* arch specific initialization */
+ if (ARCH_X86)
+ ff_showcqt_init_x86(s);
+
if ((ret = init_cqt(s)) < 0)
return ret;
diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
index b945f49..d01d90a 100644
--- a/libavfilter/avf_showcqt.h
+++ b/libavfilter/avf_showcqt.h
@@ -79,6 +79,8 @@ typedef struct {
void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
void (*draw_sono)(AVFrame *out, AVFrame *sono, int off, int idx);
void (*update_sono)(AVFrame *sono, const ColorFloat *c, int idx);
+ /* permute callback, for easier SIMD code */
+ void (*permute_coeffs)(float *val, int len);
/* performance debugging */
int64_t fft_time;
int64_t cqt_time;
@@ -112,4 +114,6 @@ typedef struct {
int axis;
} ShowCQTContext;
+void ff_showcqt_init_x86(ShowCQTContext *s);
+
#endif
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 33de380..9633a7f 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
+OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o
OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o
diff --git a/libavfilter/x86/avf_showcqt.c b/libavfilter/x86/avf_showcqt.c
new file mode 100644
index 0000000..b8e9d32
--- /dev/null
+++ b/libavfilter/x86/avf_showcqt.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016 Muhammad Faiz <mfcc64 at gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/intrinsic.h"
+#include "libavfilter/avf_showcqt.h"
+
+#if HAVE_SSE_INTRINSIC
+#include <xmmintrin.h>
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+#include <pmmintrin.h>
+#endif
+
+#if HAVE_AVX_INTRINSIC
+#include <immintrin.h>
+#endif
+
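+/* Multiply 4 real coefficients by 4 complex FFT bins and accumulate,
+ * for row k+z of the coefficient table:
+ *   a += coeffs[k+z].val[x..x+3] * src[i..i+3]  (forward bins)
+ *   b += coeffs[k+z].val[x..x+3] * src[j..j-3]  (mirrored bins, j = fft_len - i)
+ * The shuffles de-interleave re/im; the unaligned loads fetch the
+ * mirrored bins, which the shuffle masks then read in reverse order. */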
+#define CALCULATE(z) \
+do { \
+ u = _mm_load_ps(coeffs[k+z].val + x); \
+ i = coeffs[k+z].start + x; \
+ j = fft_len - i; \
+ m = _mm_load_ps(&src[i].re); \
+ n = _mm_load_ps(&src[i+2].re); \
+ m_re = _mm_shuffle_ps(m, n, _MM_SHUFFLE(2,0,2,0)); \
+ m_im = _mm_shuffle_ps(m, n, _MM_SHUFFLE(3,1,3,1)); \
+ m = _mm_loadu_ps(&src[j-1].re); \
+ n = _mm_loadu_ps(&src[j-3].re); \
+ n_re = _mm_shuffle_ps(m, n, _MM_SHUFFLE(0,2,0,2)); \
+ n_im = _mm_shuffle_ps(m, n, _MM_SHUFFLE(1,3,1,3)); \
+ a_re[z] = _mm_add_ps(a_re[z], _mm_mul_ps(u, m_re)); \
+ a_im[z] = _mm_add_ps(a_im[z], _mm_mul_ps(u, m_im)); \
+ b_re[z] = _mm_add_ps(b_re[z], _mm_mul_ps(u, n_re)); \
+ b_im[z] = _mm_add_ps(b_im[z], _mm_mul_ps(u, n_im)); \
+} while (0)
+
+#if HAVE_SSE_INTRINSIC
+static av_intrinsic_sse
+void cqt_calc_sse(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+ int len, int fft_len)
+{
+ int k, x, i, j, coeffs_len;
+ __m128 result[2];
+ __m128 l_re[2], l_im[2];
+ __m128 r_re[2], r_im[2];
+ __m128 a_re[2], a_im[2];
+ __m128 b_re[2], b_im[2];
+ __m128 m, n;
+ __m128 m_re, m_im;
+ __m128 n_re, n_im;
+ __m128 u;
+
+ for (k = 0; k < len; k += 2) {
+ a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm_setzero_ps();
+ b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm_setzero_ps();
+
+ coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+ for (x = 0; x < coeffs_len; x += 4) {
+ CALCULATE(0);
+ CALCULATE(1);
+ }
+
+ coeffs_len = coeffs[k].len;
+ for ( ; x < coeffs_len; x += 4)
+ CALCULATE(0);
+
+ coeffs_len = coeffs[k+1].len;
+ for ( ; x < coeffs_len; x += 4)
+ CALCULATE(1);
+
+        /* separate left and right (and multiply by 2.0): with stereo input
+         * packed into the FFT as l + i*r, L(i) = X(i) + conj(X(fft_len-i))
+         * and R(i) = -i * (X(i) - conj(X(fft_len-i))), up to a factor 0.5;
+         * the shuffle/add pairs below emulate the SSE3 haddps */
+#define SEPARATE(z) \
+do { \
+ l_re[z] = _mm_add_ps(a_re[z], b_re[z]); \
+ l_im[z] = _mm_sub_ps(a_im[z], b_im[z]); \
+ r_re[z] = _mm_add_ps(b_im[z], a_im[z]); \
+ r_im[z] = _mm_sub_ps(b_re[z], a_re[z]); \
+ m = _mm_shuffle_ps(l_re[z], l_im[z], _MM_SHUFFLE(2,0,2,0)); \
+ n = _mm_shuffle_ps(l_re[z], l_im[z], _MM_SHUFFLE(3,1,3,1)); \
+ l_re[z] = _mm_add_ps(m, n); \
+ m = _mm_shuffle_ps(r_re[z], r_im[z], _MM_SHUFFLE(2,0,2,0)); \
+ n = _mm_shuffle_ps(r_re[z], r_im[z], _MM_SHUFFLE(3,1,3,1)); \
+ r_re[z] = _mm_add_ps(m, n); \
+ m = _mm_shuffle_ps(l_re[z], r_re[z], _MM_SHUFFLE(2,0,2,0)); \
+ n = _mm_shuffle_ps(l_re[z], r_re[z], _MM_SHUFFLE(3,1,3,1)); \
+ l_re[z] = _mm_add_ps(m, n); \
+ result[z] = _mm_mul_ps(l_re[z], l_re[z]); \
+} while (0)
+ SEPARATE(0);
+ SEPARATE(1);
+#undef SEPARATE
+ m = _mm_shuffle_ps(result[0], result[1], _MM_SHUFFLE(2,0,2,0));
+ n = _mm_shuffle_ps(result[0], result[1], _MM_SHUFFLE(3,1,3,1));
+ _mm_store_ps(&dst[k].re, _mm_add_ps(m, n));
+ }
+}
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+static av_intrinsic_sse3
+void cqt_calc_sse3(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+ int len, int fft_len)
+{
+ int k, x, i, j, coeffs_len;
+ __m128 result[2];
+ __m128 l_re[2], l_im[2];
+ __m128 r_re[2], r_im[2];
+ __m128 a_re[2], a_im[2];
+ __m128 b_re[2], b_im[2];
+ __m128 m, n;
+ __m128 m_re, m_im;
+ __m128 n_re, n_im;
+ __m128 u;
+
+ for (k = 0; k < len; k += 2) {
+ a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm_setzero_ps();
+ b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm_setzero_ps();
+
+ coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+ for (x = 0; x < coeffs_len; x += 4) {
+ CALCULATE(0);
+ CALCULATE(1);
+ }
+
+ coeffs_len = coeffs[k].len;
+ for ( ; x < coeffs_len; x += 4)
+ CALCULATE(0);
+
+ coeffs_len = coeffs[k+1].len;
+ for ( ; x < coeffs_len; x += 4)
+ CALCULATE(1);
+
+        /* separate left and right (and multiply by 2.0); see the SSE version */
+#define SEPARATE(z) \
+do { \
+ l_re[z] = _mm_add_ps(a_re[z], b_re[z]); \
+ l_im[z] = _mm_sub_ps(a_im[z], b_im[z]); \
+ r_re[z] = _mm_add_ps(b_im[z], a_im[z]); \
+ r_im[z] = _mm_sub_ps(b_re[z], a_re[z]); \
+ l_re[z] = _mm_hadd_ps(l_re[z], l_im[z]); \
+ r_re[z] = _mm_hadd_ps(r_re[z], r_im[z]); \
+ l_re[z] = _mm_hadd_ps(l_re[z], r_re[z]); \
+ result[z] = _mm_mul_ps(l_re[z], l_re[z]); \
+} while (0)
+ SEPARATE(0);
+ SEPARATE(1);
+#undef SEPARATE
+ _mm_store_ps(&dst[k].re, _mm_hadd_ps(result[0], result[1]));
+ }
+}
+#endif
+
+#undef CALCULATE
+
+#if HAVE_AVX_INTRINSIC
+static av_intrinsic_avx
+void cqt_calc_avx(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+ int len, int fft_len)
+{
+ int k, x, i, j, coeffs_len;
+ __m128 result[2];
+ __m256 l_re[2], l_im[2];
+ __m256 r_re[2], r_im[2];
+ __m256 a_re[2], a_im[2];
+ __m256 b_re[2], b_im[2];
+ __m256 m, n;
+ __m256 m_re, m_im;
+ __m256 n_re, n_im;
+ __m256 u;
+
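+/* AVX variant of CALCULATE: the 256-bit shuffles act within each 128-bit
+ * lane, so the mirrored loads are lane-swapped with vperm2f128 and the
+ * coefficient table is pre-permuted to the matching in-lane order
+ * (see permute_coeffs_avx below). */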
+#define CALCULATE(z) \
+do { \
+ u = _mm256_load_ps(coeffs[k+z].val + x); \
+ i = coeffs[k+z].start + x; \
+ j = fft_len - i; \
+ m = _mm256_load_ps(&src[i].re); \
+ n = _mm256_load_ps(&src[i+4].re); \
+ m_re = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(2,0,2,0)); \
+ m_im = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(3,1,3,1)); \
+ m = _mm256_loadu_ps(&src[j-3].re); \
+ m = _mm256_permute2f128_ps(m, m, _MM_SHUFFLE2(0, 1)); \
+ n = _mm256_loadu_ps(&src[j-7].re); \
+ n = _mm256_permute2f128_ps(n, n, _MM_SHUFFLE2(0, 1)); \
+ n_re = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(0,2,0,2)); \
+ n_im = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(1,3,1,3)); \
+ a_re[z] = _mm256_add_ps(a_re[z], _mm256_mul_ps(u, m_re)); \
+ a_im[z] = _mm256_add_ps(a_im[z], _mm256_mul_ps(u, m_im)); \
+ b_re[z] = _mm256_add_ps(b_re[z], _mm256_mul_ps(u, n_re)); \
+ b_im[z] = _mm256_add_ps(b_im[z], _mm256_mul_ps(u, n_im)); \
+} while (0)
+
+ for (k = 0; k < len; k += 2) {
+ a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm256_setzero_ps();
+ b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm256_setzero_ps();
+
+ coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+ for (x = 0; x < coeffs_len; x += 8) {
+ CALCULATE(0);
+ CALCULATE(1);
+ }
+
+ coeffs_len = coeffs[k].len;
+ for ( ; x < coeffs_len; x += 8)
+ CALCULATE(0);
+
+ coeffs_len = coeffs[k+1].len;
+ for ( ; x < coeffs_len; x += 8)
+ CALCULATE(1);
+
+        /* separate left and right (and multiply by 2.0); see the SSE version */
+#define SEPARATE(z) \
+do { \
+ l_re[z] = _mm256_add_ps(a_re[z], b_re[z]); \
+ l_im[z] = _mm256_sub_ps(a_im[z], b_im[z]); \
+ r_re[z] = _mm256_add_ps(b_im[z], a_im[z]); \
+ r_im[z] = _mm256_sub_ps(b_re[z], a_re[z]); \
+ l_re[z] = _mm256_hadd_ps(l_re[z], l_im[z]); \
+ r_re[z] = _mm256_hadd_ps(r_re[z], r_im[z]); \
+ l_re[z] = _mm256_hadd_ps(l_re[z], r_re[z]); \
+ result[z] = _mm_add_ps(_mm256_castps256_ps128(l_re[z]), \
+ _mm256_castps256_ps128(_mm256_permute2f128_ps(l_re[z], l_re[z], _MM_SHUFFLE2(0, 1)))); \
+ result[z] = _mm_mul_ps(result[z], result[z]); \
+} while (0)
+ SEPARATE(0);
+ SEPARATE(1);
+ _mm_store_ps(&dst[k].re, _mm_hadd_ps(result[0], result[1]));
+ }
+#undef CALCULATE
+#undef SEPARATE
+}
+
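+/* Compensate for the in-lane behavior of the AVX shuffles in CALCULATE:
+ * within every group of 8 coefficients, swap elements 2,3 with 4,5 so
+ * the coefficient order matches the bin order the shuffles produce. */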
+static void permute_coeffs_avx(float *v, int len)
+{
+ int k;
+ for (k = 0; k < len; k += 8) {
+ FFSWAP(float, v[k+2], v[k+4]);
+ FFSWAP(float, v[k+3], v[k+5]);
+ }
+}
+#endif
+
+av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
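+    /* the checks run from baseline SSE to AVX, so a later match
+     * overrides the callbacks selected by an earlier one */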
+#if HAVE_SSE_INTRINSIC
+ if (cpu_flags & AV_CPU_FLAG_SSE) {
+ s->cqt_calc = cqt_calc_sse;
+ s->permute_coeffs = NULL;
+ s->cqt_align = 4;
+ }
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+ if (cpu_flags & AV_CPU_FLAG_SSE3 && !(cpu_flags & AV_CPU_FLAG_SSE3SLOW)) {
+ s->cqt_calc = cqt_calc_sse3;
+ s->permute_coeffs = NULL;
+ s->cqt_align = 4;
+ }
+#endif
+
+#if HAVE_AVX_INTRINSIC
+ if (cpu_flags & AV_CPU_FLAG_AVX && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+ s->cqt_calc = cqt_calc_avx;
+ s->permute_coeffs = permute_coeffs_avx;
+ s->cqt_align = 8;
+ }
+#endif
+}
--
2.5.0