[FFmpeg-devel] [PATCH 2/3] avfilter/avf_showcqt: cqt_calc x86 optimization
Muhammad Faiz
mfcc64 at gmail.com
Thu Mar 10 10:53:14 CET 2016
use intrinsics

cqt_time:
plain = 3.286 s
SSE   = 1.725 s
SSE3  = 1.692 s
AVX   = 1.399 s
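
For reference, a minimal scalar sketch of the kernel these functions
vectorize, reconstructed from the SIMD code below (the in-tree plain
C path may differ in detail):

    static void cqt_calc_plain_sketch(FFTComplex *dst, const FFTComplex *src,
                                      const Coeffs *coeffs, int len, int fft_len)
    {
        int k, x, i, j;
        for (k = 0; k < len; k++) {
            float a_re = 0.0f, a_im = 0.0f, b_re = 0.0f, b_im = 0.0f;
            for (x = 0; x < coeffs[k].len; x++) {
                float u = coeffs[k].val[x];
                i = coeffs[k].start + x;        /* forward FFT bin */
                j = fft_len - i;                /* mirrored FFT bin */
                a_re += u * src[i].re;  a_im += u * src[i].im;
                b_re += u * src[j].re;  b_im += u * src[j].im;
            }
            /* separate left and right (times 2.0), then store |L|^2 and |R|^2 */
            dst[k].re = (a_re + b_re) * (a_re + b_re) + (a_im - b_im) * (a_im - b_im);
            dst[k].im = (b_im + a_im) * (b_im + a_im) + (b_re - a_re) * (b_re - a_re);
        }
    }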
Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
---
 libavfilter/avf_showcqt.c     |   7 +
 libavfilter/avf_showcqt.h     |   4 +
 libavfilter/x86/Makefile      |   1 +
 libavfilter/x86/avf_showcqt.c | 289 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 301 insertions(+)
create mode 100644 libavfilter/x86/avf_showcqt.c
diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
index 8928bfb..2d2644c 100644
--- a/libavfilter/avf_showcqt.c
+++ b/libavfilter/avf_showcqt.c
@@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
w *= sign * (1.0 / s->fft_len);
s->coeffs[m].val[x - s->coeffs[m].start] = w;
}
+
+ if (s->permute_coeffs)
+ s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
}
av_expr_free(expr);
@@ -1189,6 +1192,10 @@ static int config_output(AVFilterLink *outlink)
s->update_sono = update_sono_yuv;
}
+ /* arch specific initialization */
+ if (ARCH_X86)
+ ff_showcqt_init_x86(s);
+
if ((ret = init_cqt(s)) < 0)
return ret;
diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
index b945f49..d01d90a 100644
--- a/libavfilter/avf_showcqt.h
+++ b/libavfilter/avf_showcqt.h
@@ -79,6 +79,8 @@ typedef struct {
void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
void (*draw_sono)(AVFrame *out, AVFrame *sono, int off, int idx);
void (*update_sono)(AVFrame *sono, const ColorFloat *c, int idx);
+ /* permute callback, for easier SIMD code */
+ void (*permute_coeffs)(float *val, int len);
/* performance debugging */
int64_t fft_time;
int64_t cqt_time;
@@ -112,4 +114,6 @@ typedef struct {
int axis;
} ShowCQTContext;
+void ff_showcqt_init_x86(ShowCQTContext *s);
+
#endif
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 33de380..9633a7f 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
+OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o
OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o
diff --git a/libavfilter/x86/avf_showcqt.c b/libavfilter/x86/avf_showcqt.c
new file mode 100644
index 0000000..b8e9d32
--- /dev/null
+++ b/libavfilter/x86/avf_showcqt.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016 Muhammad Faiz <mfcc64 at gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/intrinsic.h"
+#include "libavfilter/avf_showcqt.h"
+
+#if HAVE_SSE_INTRINSIC
+#include <xmmintrin.h>
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+#include <pmmintrin.h>
+#endif
+
+#if HAVE_AVX_INTRINSIC
+#include <immintrin.h>
+#endif
+
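+/* Multiply 4 real coefficients by 4 complex FFT bins and accumulate,
+ * for row k+z of the coefficient table:
+ *   a += coeffs[k+z].val[x..x+3] * src[i..i+3]  (forward bins)
+ *   b += coeffs[k+z].val[x..x+3] * src[j..j-3]  (mirrored bins, j = fft_len - i)
+ * The shuffles de-interleave re/im; the unaligned loads fetch the
+ * mirrored bins, which the shuffle masks then read in reverse order. */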
+#define CALCULATE(z) \
+do { \
+ u = _mm_load_ps(coeffs[k+z].val + x); \
+ i = coeffs[k+z].start + x; \
+ j = fft_len - i; \
+ m = _mm_load_ps(&src[i].re); \
+ n = _mm_load_ps(&src[i+2].re); \
+ m_re = _mm_shuffle_ps(m, n, _MM_SHUFFLE(2,0,2,0)); \
+ m_im = _mm_shuffle_ps(m, n, _MM_SHUFFLE(3,1,3,1)); \
+ m = _mm_loadu_ps(&src[j-1].re); \
+ n = _mm_loadu_ps(&src[j-3].re); \
+ n_re = _mm_shuffle_ps(m, n, _MM_SHUFFLE(0,2,0,2)); \
+ n_im = _mm_shuffle_ps(m, n, _MM_SHUFFLE(1,3,1,3)); \
+ a_re[z] = _mm_add_ps(a_re[z], _mm_mul_ps(u, m_re)); \
+ a_im[z] = _mm_add_ps(a_im[z], _mm_mul_ps(u, m_im)); \
+ b_re[z] = _mm_add_ps(b_re[z], _mm_mul_ps(u, n_re)); \
+ b_im[z] = _mm_add_ps(b_im[z], _mm_mul_ps(u, n_im)); \
+} while (0)
+
+#if HAVE_SSE_INTRINSIC
+static av_intrinsic_sse
+void cqt_calc_sse(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+ int len, int fft_len)
+{
+ int k, x, i, j, coeffs_len;
+ __m128 result[2];
+ __m128 l_re[2], l_im[2];
+ __m128 r_re[2], r_im[2];
+ __m128 a_re[2], a_im[2];
+ __m128 b_re[2], b_im[2];
+ __m128 m, n;
+ __m128 m_re, m_im;
+ __m128 n_re, n_im;
+ __m128 u;
+
+ for (k = 0; k < len; k += 2) {
+ a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm_setzero_ps();
+ b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm_setzero_ps();
+
+ coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+ for (x = 0; x < coeffs_len; x += 4) {
+ CALCULATE(0);
+ CALCULATE(1);
+ }
+
+ coeffs_len = coeffs[k].len;
+ for ( ; x < coeffs_len; x += 4)
+ CALCULATE(0);
+
+ coeffs_len = coeffs[k+1].len;
+ for ( ; x < coeffs_len; x += 4)
+ CALCULATE(1);
+
+        /* separate left and right (and multiply by 2.0): with stereo input
+         * packed into the FFT as l + i*r, L(i) = X(i) + conj(X(fft_len-i))
+         * and R(i) = -i * (X(i) - conj(X(fft_len-i))), up to a factor 0.5;
+         * the shuffle/add pairs below emulate the SSE3 haddps */
+#define SEPARATE(z) \
+do { \
+ l_re[z] = _mm_add_ps(a_re[z], b_re[z]); \
+ l_im[z] = _mm_sub_ps(a_im[z], b_im[z]); \
+ r_re[z] = _mm_add_ps(b_im[z], a_im[z]); \
+ r_im[z] = _mm_sub_ps(b_re[z], a_re[z]); \
+ m = _mm_shuffle_ps(l_re[z], l_im[z], _MM_SHUFFLE(2,0,2,0)); \
+ n = _mm_shuffle_ps(l_re[z], l_im[z], _MM_SHUFFLE(3,1,3,1)); \
+ l_re[z] = _mm_add_ps(m, n); \
+ m = _mm_shuffle_ps(r_re[z], r_im[z], _MM_SHUFFLE(2,0,2,0)); \
+ n = _mm_shuffle_ps(r_re[z], r_im[z], _MM_SHUFFLE(3,1,3,1)); \
+ r_re[z] = _mm_add_ps(m, n); \
+ m = _mm_shuffle_ps(l_re[z], r_re[z], _MM_SHUFFLE(2,0,2,0)); \
+ n = _mm_shuffle_ps(l_re[z], r_re[z], _MM_SHUFFLE(3,1,3,1)); \
+ l_re[z] = _mm_add_ps(m, n); \
+ result[z] = _mm_mul_ps(l_re[z], l_re[z]); \
+} while (0)
+ SEPARATE(0);
+ SEPARATE(1);
+#undef SEPARATE
+ m = _mm_shuffle_ps(result[0], result[1], _MM_SHUFFLE(2,0,2,0));
+ n = _mm_shuffle_ps(result[0], result[1], _MM_SHUFFLE(3,1,3,1));
+ _mm_store_ps(&dst[k].re, _mm_add_ps(m, n));
+ }
+}
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+static av_intrinsic_sse3
+void cqt_calc_sse3(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+ int len, int fft_len)
+{
+ int k, x, i, j, coeffs_len;
+ __m128 result[2];
+ __m128 l_re[2], l_im[2];
+ __m128 r_re[2], r_im[2];
+ __m128 a_re[2], a_im[2];
+ __m128 b_re[2], b_im[2];
+ __m128 m, n;
+ __m128 m_re, m_im;
+ __m128 n_re, n_im;
+ __m128 u;
+
+ for (k = 0; k < len; k += 2) {
+ a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm_setzero_ps();
+ b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm_setzero_ps();
+
+ coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+ for (x = 0; x < coeffs_len; x += 4) {
+ CALCULATE(0);
+ CALCULATE(1);
+ }
+
+ coeffs_len = coeffs[k].len;
+ for ( ; x < coeffs_len; x += 4)
+ CALCULATE(0);
+
+ coeffs_len = coeffs[k+1].len;
+ for ( ; x < coeffs_len; x += 4)
+ CALCULATE(1);
+
+        /* separate left and right (and multiply by 2.0); see the SSE version */
+#define SEPARATE(z) \
+do { \
+ l_re[z] = _mm_add_ps(a_re[z], b_re[z]); \
+ l_im[z] = _mm_sub_ps(a_im[z], b_im[z]); \
+ r_re[z] = _mm_add_ps(b_im[z], a_im[z]); \
+ r_im[z] = _mm_sub_ps(b_re[z], a_re[z]); \
+ l_re[z] = _mm_hadd_ps(l_re[z], l_im[z]); \
+ r_re[z] = _mm_hadd_ps(r_re[z], r_im[z]); \
+ l_re[z] = _mm_hadd_ps(l_re[z], r_re[z]); \
+ result[z] = _mm_mul_ps(l_re[z], l_re[z]); \
+} while (0)
+ SEPARATE(0);
+ SEPARATE(1);
+#undef SEPARATE
+ _mm_store_ps(&dst[k].re, _mm_hadd_ps(result[0], result[1]));
+ }
+}
+#endif
+
+#undef CALCULATE
+
+#if HAVE_AVX_INTRINSIC
+static av_intrinsic_avx
+void cqt_calc_avx(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+ int len, int fft_len)
+{
+ int k, x, i, j, coeffs_len;
+ __m128 result[2];
+ __m256 l_re[2], l_im[2];
+ __m256 r_re[2], r_im[2];
+ __m256 a_re[2], a_im[2];
+ __m256 b_re[2], b_im[2];
+ __m256 m, n;
+ __m256 m_re, m_im;
+ __m256 n_re, n_im;
+ __m256 u;
+
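+/* AVX variant of CALCULATE: the 256-bit shuffles act within each 128-bit
+ * lane, so the mirrored loads are lane-swapped with vperm2f128 and the
+ * coefficient table is pre-permuted to the matching in-lane order
+ * (see permute_coeffs_avx below). */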
+#define CALCULATE(z) \
+do { \
+ u = _mm256_load_ps(coeffs[k+z].val + x); \
+ i = coeffs[k+z].start + x; \
+ j = fft_len - i; \
+ m = _mm256_load_ps(&src[i].re); \
+ n = _mm256_load_ps(&src[i+4].re); \
+ m_re = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(2,0,2,0)); \
+ m_im = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(3,1,3,1)); \
+ m = _mm256_loadu_ps(&src[j-3].re); \
+ m = _mm256_permute2f128_ps(m, m, _MM_SHUFFLE2(0, 1)); \
+ n = _mm256_loadu_ps(&src[j-7].re); \
+ n = _mm256_permute2f128_ps(n, n, _MM_SHUFFLE2(0, 1)); \
+ n_re = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(0,2,0,2)); \
+ n_im = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(1,3,1,3)); \
+ a_re[z] = _mm256_add_ps(a_re[z], _mm256_mul_ps(u, m_re)); \
+ a_im[z] = _mm256_add_ps(a_im[z], _mm256_mul_ps(u, m_im)); \
+ b_re[z] = _mm256_add_ps(b_re[z], _mm256_mul_ps(u, n_re)); \
+ b_im[z] = _mm256_add_ps(b_im[z], _mm256_mul_ps(u, n_im)); \
+} while (0)
+
+ for (k = 0; k < len; k += 2) {
+ a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm256_setzero_ps();
+ b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm256_setzero_ps();
+
+ coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+ for (x = 0; x < coeffs_len; x += 8) {
+ CALCULATE(0);
+ CALCULATE(1);
+ }
+
+ coeffs_len = coeffs[k].len;
+ for ( ; x < coeffs_len; x += 8)
+ CALCULATE(0);
+
+ coeffs_len = coeffs[k+1].len;
+ for ( ; x < coeffs_len; x += 8)
+ CALCULATE(1);
+
+        /* separate left and right (and multiply by 2.0); see the SSE version */
+#define SEPARATE(z) \
+do { \
+ l_re[z] = _mm256_add_ps(a_re[z], b_re[z]); \
+ l_im[z] = _mm256_sub_ps(a_im[z], b_im[z]); \
+ r_re[z] = _mm256_add_ps(b_im[z], a_im[z]); \
+ r_im[z] = _mm256_sub_ps(b_re[z], a_re[z]); \
+ l_re[z] = _mm256_hadd_ps(l_re[z], l_im[z]); \
+ r_re[z] = _mm256_hadd_ps(r_re[z], r_im[z]); \
+ l_re[z] = _mm256_hadd_ps(l_re[z], r_re[z]); \
+ result[z] = _mm_add_ps(_mm256_castps256_ps128(l_re[z]), \
+ _mm256_castps256_ps128(_mm256_permute2f128_ps(l_re[z], l_re[z], _MM_SHUFFLE2(0, 1)))); \
+ result[z] = _mm_mul_ps(result[z], result[z]); \
+} while (0)
+ SEPARATE(0);
+ SEPARATE(1);
+ _mm_store_ps(&dst[k].re, _mm_hadd_ps(result[0], result[1]));
+ }
+#undef CALCULATE
+#undef SEPARATE
+}
+
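+/* Compensate for the in-lane behavior of the AVX shuffles in CALCULATE:
+ * within every group of 8 coefficients, swap elements 2,3 with 4,5 so
+ * the coefficient order matches the bin order the shuffles produce. */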
+static void permute_coeffs_avx(float *v, int len)
+{
+ int k;
+ for (k = 0; k < len; k += 8) {
+ FFSWAP(float, v[k+2], v[k+4]);
+ FFSWAP(float, v[k+3], v[k+5]);
+ }
+}
+#endif
+
+av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
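+    /* the checks run from baseline SSE to AVX, so a later match
+     * overrides the callbacks selected by an earlier one */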
+#if HAVE_SSE_INTRINSIC
+ if (cpu_flags & AV_CPU_FLAG_SSE) {
+ s->cqt_calc = cqt_calc_sse;
+ s->permute_coeffs = NULL;
+ s->cqt_align = 4;
+ }
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+ if (cpu_flags & AV_CPU_FLAG_SSE3 && !(cpu_flags & AV_CPU_FLAG_SSE3SLOW)) {
+ s->cqt_calc = cqt_calc_sse3;
+ s->permute_coeffs = NULL;
+ s->cqt_align = 4;
+ }
+#endif
+
+#if HAVE_AVX_INTRINSIC
+ if (cpu_flags & AV_CPU_FLAG_AVX && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+ s->cqt_calc = cqt_calc_avx;
+ s->permute_coeffs = permute_coeffs_avx;
+ s->cqt_align = 8;
+ }
+#endif
+}
--
2.5.0